diff --git a/client.cpp b/client.cpp
index 14def33..96109bb 100644
--- a/client.cpp
+++ b/client.cpp
@@ -96,6 +96,28 @@ static void segfault(int sig, siginfo_t* info, void* unused) {
     raise(SIGSEGV);
 }
 
+void maybe_copy_unified_arg(const int index, void* arg, enum cudaMemcpyKind kind)
+{
+    auto& unified_devices = conns[index].unified_devices;
+    auto found = unified_devices.find(arg);
+    if (found != unified_devices.end())
+    {
+        std::cout << "found unified arg pointer; copying..." << std::endl;
+
+        void* ptr = found->first;
+        size_t size = found->second;
+
+        cudaError_t res = cudaMemcpy(ptr, ptr, size, kind);
+
+        if (res != cudaSuccess) {
+            std::cerr << "cudaMemcpy failed: " << cudaGetErrorString(res) << std::endl;
+        } else {
+            std::cout << "Successfully copied " << size << " bytes" << std::endl;
+        }
+    }
+}
+
+
 static void set_segfault_handlers() {
     if (init > 0) {
         return;
diff --git a/codegen/codegen.py b/codegen/codegen.py
index 14d7b84..d4c84eb 100644
--- a/codegen/codegen.py
+++ b/codegen/codegen.py
@@ -104,6 +104,9 @@ def client_rpc_write(self, f):
                 )
             )
+    def client_unified_copy(self, f, direction):
+        f.write("    maybe_copy_unified_arg(0, (void*){name}, cudaMemcpyDeviceToHost);\n".format(name=self.parameter.name, direction=direction))
+
     @property
     def server_declaration(self) -> str:
         c = self.ptr.ptr_to.const
@@ -209,7 +212,24 @@ def client_rpc_write(self, f):
                     length=length,
                 )
             )
-
+    def client_unified_copy(self, f, direction):
+        f.write("    maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
+
+        if isinstance(self.length, int):
+            f.write("    for (int i = 0; i < {name}; i++)\n".format(name=self.length))
+            f.write("        maybe_copy_unified_arg(0, (void*)&{name}[i], {direction});\n".format(name=self.parameter.name, direction=direction))
+        else:
+            if hasattr(self.length.type, "ptr_to"):
+                f.write("    for (int i = 0; i < static_cast<int>(*{name}); i++)\n".format(name=self.length.name))
+                f.write("        maybe_copy_unified_arg(0, (void*)&{name}[i], {direction});\n".format(name=self.parameter.name, direction=direction))
+            else:
+                if hasattr(self.parameter.type, "ptr_to"):
+                    f.write("    for (int i = 0; i < static_cast<int>({name}); i++)\n".format(name=self.length.name))
+                    f.write("        maybe_copy_unified_arg(0, (void*)&{name}[i], {direction});\n".format(name=self.parameter.name, direction=direction))
+                else:
+                    f.write("    for (int i = 0; i < static_cast<int>({name}); i++)\n".format(name=self.length.name))
+                    f.write("        maybe_copy_unified_arg(0, (void*){name}[i], {direction});\n".format(name=self.parameter.name, direction=direction))
 
     @property
     def server_declaration(self) -> str:
@@ -330,6 +350,9 @@ def client_rpc_write(self, f):
     def server_declaration(self) -> str:
         return f"    {self.ptr.format()} {self.parameter.name};\n" + \
                f"    std::size_t {self.parameter.name}_len;\n"
+
+    def client_unified_copy(self, f, direction):
+        f.write("    maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
 
     def server_rpc_read(self, f, index) -> Optional[str]:
         if not self.send:
@@ -415,6 +438,12 @@ def server_declaration(self) -> str:
             return f"    {self.type_.format().replace("const", "")} {self.parameter.name};\n"
         else:
             return f"    {self.type_.format()} {self.parameter.name};\n"
+
+    def client_unified_copy(self, f, direction):
+        if isinstance(self.type_, Pointer):
+            f.write("    maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
+        else:
+            f.write("    maybe_copy_unified_arg(0, (void*)&{name}, {direction});\n".format(name=self.parameter.name, direction=direction))
 
     def server_rpc_read(self, f):
         if not self.send:
@@ -486,6 +515,9 @@ def server_rpc_read(self, f):
                 param_type=self.type_.ptr_to.format(),
             )
         )
+
+    def client_unified_copy(self, f, direction):
+        f.write("    maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
 
     @property
     def server_reference(self) -> str:
@@ -761,6 +793,7 @@ def main():
         "extern int rpc_wait_for_response(const int index);\n"
         "extern int rpc_read(const int index, void *data, const std::size_t size);\n"
         "extern int rpc_end_response(const int index, void *return_value);\n"
+        "void maybe_copy_unified_arg(const int index, void *arg, enum cudaMemcpyKind kind);\n"
         "extern int rpc_close();\n\n"
     )
     for function, annotation, operations, disabled in functions_with_annotations:
@@ -798,6 +831,9 @@ def main():
         )
         f.write("{\n")
+        for operation in operations:
+            operation.client_unified_copy(f, "cudaMemcpyHostToDevice")
+
         f.write(
             "    {return_type} return_value;\n".format(
                 return_type=function.return_type.format()
             )
         )
@@ -841,12 +877,14 @@ def main():
             )
         )
+        for operation in operations:
+            operation.client_unified_copy(f, "cudaMemcpyDeviceToHost")
+
         if function.name.format() == "nvmlShutdown":
             f.write("    if (rpc_close() < 0)\n")
             f.write("        return {error_return};\n".format(error_return=error_const(function.return_type.format())))
         f.write("    return return_value;\n")
-
         f.write("}\n\n")
 
     f.write("std::unordered_map functionMap = {\n")
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
index 7b2dca9..ff768f4 100644
--- a/codegen/gen_client.cpp
+++ b/codegen/gen_client.cpp
@@ -19,7 +19,7 @@ extern int rpc_end_request(const int index);
 extern int rpc_wait_for_response(const int index);
 extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
-void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
+void maybe_copy_unified_arg(const int index, void *arg, enum cudaMemcpyKind kind);
 extern int rpc_close();
 
 nvmlReturn_t nvmlInit_v2()
 {
@@ -34,12 +34,14 @@ nvmlReturn_t nvmlInit_v2()
 
 nvmlReturn_t nvmlInitWithFlags(unsigned int flags)
 {
+    maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice);
     nvmlReturn_t return_value;
     if (rpc_start_request(0, RPC_nvmlInitWithFlags) < 0 ||
         rpc_write(0, &flags, sizeof(unsigned int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return NVML_ERROR_GPU_IS_LOST;
+    maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost);
     return return_value;
 }
 
@@ -57,6 +59,10 @@ nvmlReturn_t nvmlShutdown()
 
 nvmlReturn_t nvmlSystemGetDriverVersion(char* version, unsigned int length)
 {
+    maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice);
+    for (int i = 0; i < static_cast<int>(length); i++)
+        maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice);
     nvmlReturn_t return_value;
     if (rpc_start_request(0, RPC_nvmlSystemGetDriverVersion) < 0 ||
         rpc_write(0, &length, sizeof(unsigned int)) < 0 ||
@@ -64,11 +70,19 @@ nvmlReturn_t nvmlSystemGetDriverVersion(char* version, unsigned int length)
         rpc_read(0, version, length * sizeof(char)) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return NVML_ERROR_GPU_IS_LOST;
+    maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost);
+    for (int i = 0; i < static_cast<int>(length); i++)
+
maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetNVMLVersion(char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetNVMLVersion) < 0 || rpc_write(0, &length, sizeof(unsigned int)) < 0 || @@ -76,33 +90,46 @@ nvmlReturn_t nvmlSystemGetNVMLVersion(char* version, unsigned int length) rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetCudaDriverVersion(int* cudaDriverVersion) { + maybe_copy_unified_arg(0, (void*)cudaDriverVersion, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetCudaDriverVersion) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, cudaDriverVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)cudaDriverVersion, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetCudaDriverVersion_v2(int* cudaDriverVersion) { + maybe_copy_unified_arg(0, (void*)cudaDriverVersion, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetCudaDriverVersion_v2) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, cudaDriverVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)cudaDriverVersion, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetProcessName(unsigned int pid, char* name, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetProcessName) < 0 || rpc_write(0, &pid, sizeof(unsigned int)) < 0 || @@ -111,22 +138,31 @@ nvmlReturn_t nvmlSystemGetProcessName(unsigned int pid, char* name, unsigned int rpc_read(0, name, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetCount(unsigned int* unitCount) { + maybe_copy_unified_arg(0, (void*)unitCount, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetCount) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, unitCount, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + 
maybe_copy_unified_arg(0, (void*)unitCount, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t* unit) { + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)unit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetHandleByIndex) < 0 || rpc_write(0, &index, sizeof(unsigned int)) < 0 || @@ -134,11 +170,15 @@ nvmlReturn_t nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t* unit) rpc_read(0, unit, sizeof(nvmlUnit_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)unit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetUnitInfo) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -146,11 +186,15 @@ nvmlReturn_t nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t* info) rpc_read(0, info, sizeof(nvmlUnitInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t* state) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)state, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetLedState) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -158,11 +202,15 @@ nvmlReturn_t nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t* state) rpc_read(0, state, sizeof(nvmlLedState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)state, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t* psu) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)psu, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetPsuInfo) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -170,11 +218,16 @@ nvmlReturn_t nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t* psu) rpc_read(0, psu, sizeof(nvmlPSUInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)psu, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int* temp) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetTemperature) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -183,11 +236,16 @@ nvmlReturn_t nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned rpc_read(0, temp, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) 
return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t* fanSpeeds) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fanSpeeds, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetFanSpeedInfo) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -195,11 +253,18 @@ nvmlReturn_t nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t* fanSp rpc_read(0, fanSpeeds, sizeof(nvmlUnitFanSpeeds_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fanSpeeds, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int* deviceCount, nvmlDevice_t* devices) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devices, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*deviceCount); i++) + maybe_copy_unified_arg(0, (void*)&devices[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetDevices) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -209,11 +274,20 @@ nvmlReturn_t nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int* deviceCount, nvml rpc_read(0, devices, *deviceCount * sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devices, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*deviceCount); i++) + maybe_copy_unified_arg(0, (void*)&devices[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetHicVersion(unsigned int* hwbcCount, nvmlHwbcEntry_t* hwbcEntries) { + maybe_copy_unified_arg(0, (void*)hwbcCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)hwbcEntries, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*hwbcCount); i++) + maybe_copy_unified_arg(0, (void*)&hwbcEntries[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetHicVersion) < 0 || rpc_write(0, hwbcCount, sizeof(unsigned int)) < 0 || @@ -222,22 +296,30 @@ nvmlReturn_t nvmlSystemGetHicVersion(unsigned int* hwbcCount, nvmlHwbcEntry_t* h rpc_read(0, hwbcEntries, *hwbcCount * sizeof(nvmlHwbcEntry_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)hwbcCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)hwbcEntries, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*hwbcCount); i++) + maybe_copy_unified_arg(0, (void*)&hwbcEntries[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int* deviceCount) { + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCount_v2) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, deviceCount, sizeof(unsigned int)) < 0 || rpc_end_response(0, 
&return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t* attributes) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAttributes_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -245,11 +327,15 @@ nvmlReturn_t nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttribute rpc_read(0, attributes, sizeof(nvmlDeviceAttributes_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetHandleByIndex_v2) < 0 || rpc_write(0, &index, sizeof(unsigned int)) < 0 || @@ -257,11 +343,15 @@ nvmlReturn_t nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t* dev rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHandleBySerial(const char* serial, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)serial, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; std::size_t serial_len = std::strlen(serial) + 1; if (rpc_start_request(0, RPC_nvmlDeviceGetHandleBySerial) < 0 || @@ -271,11 +361,15 @@ nvmlReturn_t nvmlDeviceGetHandleBySerial(const char* serial, nvmlDevice_t* devic rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)serial, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHandleByUUID(const char* uuid, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; std::size_t uuid_len = std::strlen(uuid) + 1; if (rpc_start_request(0, RPC_nvmlDeviceGetHandleByUUID) < 0 || @@ -285,11 +379,15 @@ nvmlReturn_t nvmlDeviceGetHandleByUUID(const char* uuid, nvmlDevice_t* device) rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char* pciBusId, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; std::size_t pciBusId_len = std::strlen(pciBusId) + 1; if (rpc_start_request(0, RPC_nvmlDeviceGetHandleByPciBusId_v2) < 0 || @@ -299,11 +397,18 
@@ nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char* pciBusId, nvmlDevice_t rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetName(nvmlDevice_t device, char* name, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetName) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -312,11 +417,18 @@ nvmlReturn_t nvmlDeviceGetName(nvmlDevice_t device, char* name, unsigned int len rpc_read(0, name, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBrand) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -324,11 +436,15 @@ nvmlReturn_t nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type) rpc_read(0, type, sizeof(nvmlBrandType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)index, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetIndex) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -336,11 +452,18 @@ nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) rpc_read(0, index, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)index, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSerial(nvmlDevice_t device, char* serial, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)serial, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&serial[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSerial) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -349,11 +472,22 @@ nvmlReturn_t nvmlDeviceGetSerial(nvmlDevice_t device, char* 
serial, unsigned int rpc_read(0, serial, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)serial, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&serial[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long* nodeSet, nvmlAffinityScope_t scope) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nodeSetSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeSet, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(nodeSetSize); i++) + maybe_copy_unified_arg(0, (void*)&nodeSet[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryAffinity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -363,11 +497,23 @@ nvmlReturn_t nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeS rpc_read(0, nodeSet, nodeSetSize * sizeof(unsigned long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nodeSetSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeSet, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(nodeSetSize); i++) + maybe_copy_unified_arg(0, (void*)&nodeSet[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long* cpuSet, nvmlAffinityScope_t scope) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cpuSetSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)cpuSet, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(cpuSetSize); i++) + maybe_copy_unified_arg(0, (void*)&cpuSet[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCpuAffinityWithinScope) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -377,11 +523,22 @@ nvmlReturn_t nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned i rpc_read(0, cpuSet, cpuSetSize * sizeof(unsigned long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cpuSetSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)cpuSet, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(cpuSetSize); i++) + maybe_copy_unified_arg(0, (void*)&cpuSet[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long* cpuSet) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cpuSetSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)cpuSet, cudaMemcpyHostToDevice); + for (int i = 0; i < 
static_cast(cpuSetSize); i++) + maybe_copy_unified_arg(0, (void*)&cpuSet[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCpuAffinity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -390,33 +547,45 @@ nvmlReturn_t nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSi rpc_read(0, cpuSet, cpuSetSize * sizeof(unsigned long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cpuSetSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)cpuSet, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(cpuSetSize); i++) + maybe_copy_unified_arg(0, (void*)&cpuSet[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetCpuAffinity(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetCpuAffinity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceClearCpuAffinity(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceClearCpuAffinity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t* pathInfo) { + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pathInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTopologyCommonAncestor) < 0 || rpc_write(0, &device1, sizeof(nvmlDevice_t)) < 0 || @@ -425,11 +594,20 @@ nvmlReturn_t nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevic rpc_read(0, pathInfo, sizeof(nvmlGpuTopologyLevel_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pathInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int* count, nvmlDevice_t* deviceArray) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&deviceArray[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTopologyNearestGpus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -440,11 +618,22 @@ nvmlReturn_t nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopolo rpc_read(0, deviceArray, 
*count * sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&deviceArray[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int* count, nvmlDevice_t* deviceArray) { + maybe_copy_unified_arg(0, (void*)&cpuNumber, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&deviceArray[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetTopologyGpuSet) < 0 || rpc_write(0, &cpuNumber, sizeof(unsigned int)) < 0 || @@ -454,11 +643,20 @@ nvmlReturn_t nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int* c rpc_read(0, deviceArray, *count * sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&cpuNumber, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&deviceArray[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus) { + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&p2pIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)p2pStatus, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetP2PStatus) < 0 || rpc_write(0, &device1, sizeof(nvmlDevice_t)) < 0 || @@ -468,11 +666,20 @@ nvmlReturn_t nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, rpc_read(0, p2pStatus, sizeof(nvmlGpuP2PStatus_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&p2pIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)p2pStatus, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetUUID(nvmlDevice_t device, char* uuid, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetUUID) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -481,11 +688,21 @@ nvmlReturn_t nvmlDeviceGetUUID(nvmlDevice_t device, char* uuid, unsigned int len rpc_read(0, uuid, length * sizeof(char)) < 0 || rpc_end_response(0, 
&return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char* mdevUuid, unsigned int size) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mdevUuid, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&mdevUuid[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetMdevUUID) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -494,11 +711,18 @@ nvmlReturn_t nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char* rpc_read(0, mdevUuid, size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mdevUuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&mdevUuid[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minorNumber, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMinorNumber) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -506,11 +730,18 @@ nvmlReturn_t nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNu rpc_read(0, minorNumber, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minorNumber, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)partNumber, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&partNumber[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBoardPartNumber) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -519,11 +750,22 @@ nvmlReturn_t nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, rpc_read(0, partNumber, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)partNumber, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&partNumber[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t 
nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetInforomVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -533,11 +775,22 @@ nvmlReturn_t nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_ rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetInforomImageVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -546,11 +799,18 @@ nvmlReturn_t nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char* version rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int* checksum) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)checksum, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetInforomConfigurationChecksum) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -558,22 +818,28 @@ nvmlReturn_t nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsi rpc_read(0, checksum, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)checksum, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceValidateInforom(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceValidateInforom) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || 
rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t* display) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)display, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDisplayMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -581,11 +847,15 @@ nvmlReturn_t nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t* di rpc_read(0, display, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)display, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t* isActive) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isActive, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDisplayActive) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -593,11 +863,15 @@ nvmlReturn_t nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t* rpc_read(0, isActive, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isActive, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t* mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPersistenceMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -605,11 +879,15 @@ nvmlReturn_t nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t rpc_read(0, mode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t* pci) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pci, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPciInfo_v3) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -617,11 +895,15 @@ nvmlReturn_t nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t* pci) rpc_read(0, pci, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pci, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int* maxLinkGen) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxLinkGen, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxPcieLinkGeneration) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || 
@@ -629,11 +911,15 @@ nvmlReturn_t nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned in rpc_read(0, maxLinkGen, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxLinkGen, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int* maxLinkGenDevice) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxLinkGenDevice, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuMaxPcieLinkGeneration) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -641,11 +927,15 @@ nvmlReturn_t nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned rpc_read(0, maxLinkGenDevice, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxLinkGenDevice, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int* maxLinkWidth) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxLinkWidth, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxPcieLinkWidth) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -653,11 +943,15 @@ nvmlReturn_t nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int* ma rpc_read(0, maxLinkWidth, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxLinkWidth, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int* currLinkGen) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)currLinkGen, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCurrPcieLinkGeneration) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -665,11 +959,15 @@ nvmlReturn_t nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned i rpc_read(0, currLinkGen, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)currLinkGen, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int* currLinkWidth) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)currLinkWidth, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCurrPcieLinkWidth) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -677,11 +975,16 @@ nvmlReturn_t nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int* c rpc_read(0, currLinkWidth, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)currLinkWidth, cudaMemcpyDeviceToHost); return return_value; } 
nvmlReturn_t nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int* value) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPcieThroughput) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -690,11 +993,16 @@ nvmlReturn_t nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounte rpc_read(0, value, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int* value) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPcieReplayCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -702,11 +1010,16 @@ nvmlReturn_t nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int* v rpc_read(0, value, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int* clock) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clock, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetClockInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -715,11 +1028,17 @@ nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, u rpc_read(0, clock, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clock, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int* clock) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clock, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxClockInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -728,11 +1047,17 @@ nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type rpc_read(0, clock, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clock, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, 
unsigned int* clockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetApplicationsClock) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -741,11 +1066,17 @@ nvmlReturn_t nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t rpc_read(0, clockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int* clockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDefaultApplicationsClock) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -754,22 +1085,31 @@ nvmlReturn_t nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlCloc rpc_read(0, clockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetApplicationsClocks(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetApplicationsClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int* clockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetClock) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -779,11 +1119,18 @@ nvmlReturn_t nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, rpc_read(0, clockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int* clockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxCustomerBoostClock) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -792,11 +1139,19 @@ nvmlReturn_t nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockTy rpc_read(0, clockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int* count, unsigned int* clocksMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clocksMHz, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&clocksMHz[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedMemoryClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -806,11 +1161,22 @@ nvmlReturn_t nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned in rpc_read(0, clocksMHz, *count * sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clocksMHz, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&clocksMHz[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int* count, unsigned int* clocksMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memoryClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clocksMHz, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&clocksMHz[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedGraphicsClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -821,11 +1187,20 @@ nvmlReturn_t nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned rpc_read(0, clocksMHz, *count * sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memoryClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clocksMHz, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&clocksMHz[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t* isEnabled, nvmlEnableState_t* defaultIsEnabled) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)isEnabled, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)defaultIsEnabled, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAutoBoostedClocksEnabled) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -834,11 +1209,16 @@ nvmlReturn_t nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnab rpc_read(0, defaultIsEnabled, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)defaultIsEnabled, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&enabled, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetAutoBoostedClocksEnabled) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -846,11 +1226,16 @@ nvmlReturn_t nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnab rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&enabled, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&enabled, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetDefaultAutoBoostedClocksEnabled) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -859,11 +1244,16 @@ nvmlReturn_t nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, n rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&enabled, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int* speed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)speed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFanSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -871,11 +1261,16 @@ nvmlReturn_t nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int* speed) rpc_read(0, speed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)speed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int* speed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)speed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, 
RPC_nvmlDeviceGetFanSpeed_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -884,11 +1279,17 @@ nvmlReturn_t nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, uns rpc_read(0, speed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)speed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int* targetSpeed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)targetSpeed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTargetFanSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -897,11 +1298,16 @@ nvmlReturn_t nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, rpc_read(0, targetSpeed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)targetSpeed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetDefaultFanSpeed_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -909,11 +1315,16 @@ nvmlReturn_t nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int f rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int* minSpeed, unsigned int* maxSpeed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minSpeed, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxSpeed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMinMaxFanSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -922,11 +1333,17 @@ nvmlReturn_t nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int* minS rpc_read(0, maxSpeed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minSpeed, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxSpeed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int fan, nvmlFanControlPolicy_t* policy) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)policy, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFanControlPolicy_v2) < 0 || rpc_write(0, &device, 
sizeof(nvmlDevice_t)) < 0 || @@ -935,11 +1352,17 @@ nvmlReturn_t nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int rpc_read(0, policy, sizeof(nvmlFanControlPolicy_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)policy, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan, nvmlFanControlPolicy_t policy) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&policy, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetFanControlPolicy) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -948,11 +1371,16 @@ nvmlReturn_t nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&policy, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int* numFans) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numFans, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNumFans) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -960,11 +1388,16 @@ nvmlReturn_t nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int* numFans) rpc_read(0, numFans, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numFans, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int* temp) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sensorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTemperature) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -973,11 +1406,17 @@ nvmlReturn_t nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensor rpc_read(0, temp, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sensorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int* temp) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&thresholdType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTemperatureThreshold) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -986,11 +1425,17 
@@ nvmlReturn_t nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperat rpc_read(0, temp, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&thresholdType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int* temp) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&thresholdType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetTemperatureThreshold) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1000,11 +1445,17 @@ nvmlReturn_t nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperat rpc_read(0, temp, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&thresholdType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sensorIndex, nvmlGpuThermalSettings_t* pThermalSettings) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sensorIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pThermalSettings, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetThermalSettings) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1013,11 +1464,16 @@ nvmlReturn_t nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sens rpc_read(0, pThermalSettings, sizeof(nvmlGpuThermalSettings_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sensorIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pThermalSettings, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t* pState) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPerformanceState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1025,11 +1481,15 @@ nvmlReturn_t nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t* p rpc_read(0, pState, sizeof(nvmlPstates_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long* clocksThrottleReasons) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clocksThrottleReasons, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCurrentClocksThrottleReasons) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ 
-1037,11 +1497,15 @@ nvmlReturn_t nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsi rpc_read(0, clocksThrottleReasons, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clocksThrottleReasons, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long* supportedClocksThrottleReasons) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)supportedClocksThrottleReasons, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedClocksThrottleReasons) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1049,11 +1513,15 @@ nvmlReturn_t nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, un rpc_read(0, supportedClocksThrottleReasons, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)supportedClocksThrottleReasons, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t* pState) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1061,11 +1529,15 @@ nvmlReturn_t nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t* pState) rpc_read(0, pState, sizeof(nvmlPstates_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t* mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerManagementMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1073,11 +1545,15 @@ nvmlReturn_t nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableSta rpc_read(0, mode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int* limit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)limit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerManagementLimit) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1085,11 +1561,16 @@ nvmlReturn_t nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int rpc_read(0, limit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)limit, cudaMemcpyDeviceToHost); 
return return_value; } nvmlReturn_t nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int* minLimit, unsigned int* maxLimit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minLimit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxLimit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerManagementLimitConstraints) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1098,11 +1579,16 @@ nvmlReturn_t nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, u rpc_read(0, maxLimit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minLimit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxLimit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int* defaultLimit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)defaultLimit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerManagementDefaultLimit) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1110,11 +1596,15 @@ nvmlReturn_t nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsig rpc_read(0, defaultLimit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)defaultLimit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int* power) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)power, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerUsage) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1122,11 +1612,15 @@ nvmlReturn_t nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int* power) rpc_read(0, power, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)power, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long* energy) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)energy, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTotalEnergyConsumption) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1134,11 +1628,15 @@ nvmlReturn_t nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned l rpc_read(0, energy, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)energy, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int* limit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)limit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if 
(rpc_start_request(0, RPC_nvmlDeviceGetEnforcedPowerLimit) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1146,11 +1644,16 @@ nvmlReturn_t nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int* rpc_read(0, limit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)limit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t* current, nvmlGpuOperationMode_t* pending) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuOperationMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1159,11 +1662,16 @@ nvmlReturn_t nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperation rpc_read(0, pending, sizeof(nvmlGpuOperationMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t* memory) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memory, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1171,11 +1679,15 @@ nvmlReturn_t nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t* memory) rpc_read(0, memory, sizeof(nvmlMemory_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memory, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t* memory) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memory, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryInfo_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1183,11 +1695,15 @@ nvmlReturn_t nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t* me rpc_read(0, memory, sizeof(nvmlMemory_v2_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memory, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t* mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetComputeMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1195,11 +1711,16 @@ nvmlReturn_t nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t* mo rpc_read(0, mode, sizeof(nvmlComputeMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + 
maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)major, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minor, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCudaComputeCapability) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1208,11 +1729,17 @@ nvmlReturn_t nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, rpc_read(0, minor, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)major, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minor, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t* current, nvmlEnableState_t* pending) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEccMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1221,11 +1748,16 @@ nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t* curren rpc_read(0, pending, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t* defaultMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)defaultMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDefaultEccMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1233,11 +1765,15 @@ nvmlReturn_t nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t* rpc_read(0, defaultMode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)defaultMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int* boardId) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)boardId, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBoardId) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1245,11 +1781,15 @@ nvmlReturn_t nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int* boardId) rpc_read(0, boardId, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)boardId, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned 
int* multiGpuBool) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)multiGpuBool, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMultiGpuBoard) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1257,11 +1797,17 @@ nvmlReturn_t nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int* multi rpc_read(0, multiGpuBool, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)multiGpuBool, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long* eccCounts) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)eccCounts, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTotalEccErrors) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1271,11 +1817,19 @@ nvmlReturn_t nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorTyp rpc_read(0, eccCounts, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)eccCounts, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t* eccCounts) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)eccCounts, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDetailedEccErrors) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1285,11 +1839,20 @@ nvmlReturn_t nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryError rpc_read(0, eccCounts, sizeof(nvmlEccErrorCounts_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)eccCounts, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlMemoryLocation_t locationType, unsigned long long* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&locationType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); nvmlReturn_t 
return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryErrorCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1300,11 +1863,18 @@ nvmlReturn_t nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErro rpc_read(0, count, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&locationType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t* utilization) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetUtilizationRates) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1312,11 +1882,16 @@ nvmlReturn_t nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_ rpc_read(0, utilization, sizeof(nvmlUtilization_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int* utilization, unsigned int* samplingPeriodUs) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)samplingPeriodUs, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEncoderUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1325,11 +1900,17 @@ nvmlReturn_t nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int* rpc_read(0, samplingPeriodUs, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)samplingPeriodUs, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEncoderCapacity(nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int* encoderCapacity) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&encoderQueryType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)encoderCapacity, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEncoderCapacity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1338,11 +1919,18 @@ nvmlReturn_t nvmlDeviceGetEncoderCapacity(nvmlDevice_t device, nvmlEncoderType_t rpc_read(0, encoderCapacity, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&encoderQueryType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)encoderCapacity, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEncoderStats(nvmlDevice_t device, unsigned 
int* sessionCount, unsigned int* averageFps, unsigned int* averageLatency) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)averageFps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)averageLatency, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEncoderStats) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1352,11 +1940,20 @@ nvmlReturn_t nvmlDeviceGetEncoderStats(nvmlDevice_t device, unsigned int* sessio rpc_read(0, averageLatency, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)averageFps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)averageLatency, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int* sessionCount, nvmlEncoderSessionInfo_t* sessionInfos) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionInfos, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfos[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEncoderSessions) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1366,11 +1963,19 @@ nvmlReturn_t nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int* ses rpc_read(0, sessionInfos, *sessionCount * sizeof(nvmlEncoderSessionInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionInfos, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfos[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int* utilization, unsigned int* samplingPeriodUs) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)samplingPeriodUs, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDecoderUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1379,11 +1984,16 @@ nvmlReturn_t nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int* rpc_read(0, samplingPeriodUs, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)samplingPeriodUs, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t* fbcStats) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fbcStats, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if 
(rpc_start_request(0, RPC_nvmlDeviceGetFBCStats) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1391,11 +2001,18 @@ nvmlReturn_t nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t* fbcStats rpc_read(0, fbcStats, sizeof(nvmlFBCStats_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fbcStats, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int* sessionCount, nvmlFBCSessionInfo_t* sessionInfo) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFBCSessions) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1405,11 +2022,19 @@ nvmlReturn_t nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int* session rpc_read(0, sessionInfo, *sessionCount * sizeof(nvmlFBCSessionInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t* current, nvmlDriverModel_t* pending) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDriverModel) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1418,11 +2043,19 @@ nvmlReturn_t nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t* cu rpc_read(0, pending, sizeof(nvmlDriverModel_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVbiosVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1431,11 +2064,18 @@ nvmlReturn_t nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char* version, unsig rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t* bridgeHierarchy) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bridgeHierarchy, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBridgeChipInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1443,11 +2083,18 @@ nvmlReturn_t nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHier rpc_read(0, bridgeHierarchy, sizeof(nvmlBridgeChipHierarchy_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bridgeHierarchy, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetComputeRunningProcesses_v3) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1457,11 +2104,21 @@ nvmlReturn_t nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsign rpc_read(0, infos, *infoCount * sizeof(nvmlProcessInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGraphicsRunningProcesses_v3) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1471,11 +2128,21 @@ nvmlReturn_t nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsig rpc_read(0, infos, *infoCount * sizeof(nvmlProcessInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t 
device, unsigned int* infoCount, nvmlProcessInfo_t* infos) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMPSComputeRunningProcesses_v3) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1485,11 +2152,19 @@ nvmlReturn_t nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, uns rpc_read(0, infos, *infoCount * sizeof(nvmlProcessInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int* onSameBoard) { + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)onSameBoard, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceOnSameBoard) < 0 || rpc_write(0, &device1, sizeof(nvmlDevice_t)) < 0 || @@ -1498,11 +2173,17 @@ nvmlReturn_t nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, i rpc_read(0, onSameBoard, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)onSameBoard, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t* isRestricted) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&apiType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isRestricted, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAPIRestriction) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1511,11 +2192,22 @@ nvmlReturn_t nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_ rpc_read(0, isRestricted, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&apiType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isRestricted, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, nvmlValueType_t* sampleValType, unsigned int* sampleCount, nvmlSample_t* samples) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sampleValType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sampleCount, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)samples, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sampleCount); i++) + maybe_copy_unified_arg(0, (void*)&samples[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSamples) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1528,11 +2220,21 @@ nvmlReturn_t nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, rpc_read(0, samples, *sampleCount * sizeof(nvmlSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sampleValType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sampleCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)samples, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sampleCount); i++) + maybe_copy_unified_arg(0, (void*)&samples[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t* bar1Memory) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bar1Memory, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBAR1MemoryInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1540,11 +2242,16 @@ nvmlReturn_t nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t* rpc_read(0, bar1Memory, sizeof(nvmlBAR1Memory_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bar1Memory, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t* violTime) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&perfPolicyType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)violTime, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetViolationStatus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1553,11 +2260,16 @@ nvmlReturn_t nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyTyp rpc_read(0, violTime, sizeof(nvmlViolationTime_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&perfPolicyType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)violTime, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int* irqNum) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)irqNum, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetIrqNum) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1565,11 +2277,15 @@ nvmlReturn_t nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int* irqNum) rpc_read(0, irqNum, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)irqNum, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int* numCores) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numCores, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNumGpuCores) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1577,11 +2293,15 @@ nvmlReturn_t nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int* numCore rpc_read(0, numCores, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numCores, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t* powerSource) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)powerSource, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerSource) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1589,11 +2309,15 @@ nvmlReturn_t nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t* po rpc_read(0, powerSource, sizeof(nvmlPowerSource_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)powerSource, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int* busWidth) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)busWidth, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryBusWidth) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1601,11 +2325,15 @@ nvmlReturn_t nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int* busW rpc_read(0, busWidth, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)busWidth, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int* maxSpeed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxSpeed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPcieLinkMaxSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1613,11 +2341,15 @@ nvmlReturn_t nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int* ma rpc_read(0, maxSpeed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxSpeed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int* pcieSpeed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pcieSpeed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPcieSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1625,11 +2357,15 @@ 
nvmlReturn_t nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int* pcieSpeed rpc_read(0, pcieSpeed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pcieSpeed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int* adaptiveClockStatus) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)adaptiveClockStatus, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAdaptiveClockInfoStatus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1637,11 +2373,15 @@ nvmlReturn_t nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned rpc_read(0, adaptiveClockStatus, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)adaptiveClockStatus, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t* mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAccountingMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1649,11 +2389,16 @@ nvmlReturn_t nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t* rpc_read(0, mode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t* stats) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)stats, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAccountingStats) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1662,11 +2407,19 @@ nvmlReturn_t nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, rpc_read(0, stats, sizeof(nvmlAccountingStats_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)stats, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int* count, unsigned int* pids) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pids, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&pids[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAccountingPids) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1676,11 +2429,18 @@ nvmlReturn_t nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int* coun rpc_read(0, pids, *count 
* sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pids, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&pids[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int* bufferSize) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAccountingBufferSize) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1688,11 +2448,19 @@ nvmlReturn_t nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int rpc_read(0, bufferSize, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, unsigned int* pageCount, unsigned long long* addresses) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cause, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pageCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)addresses, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&addresses[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRetiredPages) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1703,11 +2471,26 @@ nvmlReturn_t nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCa rpc_read(0, addresses, *pageCount * sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cause, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pageCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)addresses, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&addresses[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, unsigned int* pageCount, unsigned long long* addresses, unsigned long long* timestamps) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cause, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pageCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)addresses, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&addresses[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)timestamps, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&timestamps[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRetiredPages_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1719,11 +2502,22 @@ 
nvmlReturn_t nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetiremen rpc_read(0, timestamps, *pageCount * sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cause, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pageCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)addresses, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&addresses[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)timestamps, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&timestamps[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t* isPending) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isPending, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRetiredPagesPendingStatus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1731,11 +2525,18 @@ nvmlReturn_t nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEna rpc_read(0, isPending, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isPending, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int* corrRows, unsigned int* uncRows, unsigned int* isPending, unsigned int* failureOccurred) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)corrRows, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)uncRows, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isPending, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)failureOccurred, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRemappedRows) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1746,11 +2547,18 @@ nvmlReturn_t nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int* corrRo rpc_read(0, failureOccurred, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)corrRows, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)uncRows, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isPending, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)failureOccurred, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t* values) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRowRemapperHistogram) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1758,11 +2566,15 @@ nvmlReturn_t nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemap rpc_read(0, values, sizeof(nvmlRowRemapperHistogramValues_t)) < 0 || rpc_end_response(0, &return_value) < 0) return 
NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t* arch) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)arch, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetArchitecture) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1770,11 +2582,15 @@ nvmlReturn_t nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitectu rpc_read(0, arch, sizeof(nvmlDeviceArchitecture_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)arch, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&color, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitSetLedState) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -1782,11 +2598,15 @@ nvmlReturn_t nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&color, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetPersistenceMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1794,11 +2614,15 @@ nvmlReturn_t nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetComputeMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1806,11 +2630,15 @@ nvmlReturn_t nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mod rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ecc, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetEccMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1818,11 +2646,15 @@ nvmlReturn_t nvmlDeviceSetEccMode(nvmlDevice_t device, 
nvmlEnableState_t ecc) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ecc, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceClearEccErrorCounts) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1830,11 +2662,16 @@ nvmlReturn_t nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterTy rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&driverModel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetDriverModel) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1843,11 +2680,17 @@ nvmlReturn_t nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t dri rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&driverModel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minGpuClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxGpuClockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetGpuLockedClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1856,22 +2699,30 @@ nvmlReturn_t nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minG rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minGpuClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxGpuClockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetGpuLockedClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz) { + maybe_copy_unified_arg(0, (void*)&device, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minMemClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxMemClockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetMemoryLockedClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1880,22 +2731,30 @@ nvmlReturn_t nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int m rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minMemClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxMemClockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetMemoryLockedClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graphicsClockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetApplicationsClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1904,11 +2763,16 @@ nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int m rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graphicsClockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t* status) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)status, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetClkMonStatus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1916,11 +2780,15 @@ nvmlReturn_t nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t* rpc_read(0, status, sizeof(nvmlClkMonStatus_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)status, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetPowerManagementLimit) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1928,11 +2796,15 @@ nvmlReturn_t nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + 
maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetGpuOperationMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1940,11 +2812,16 @@ nvmlReturn_t nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperation rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&apiType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&isRestricted, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetAPIRestriction) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1953,11 +2830,16 @@ nvmlReturn_t nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_ rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&apiType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&isRestricted, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetAccountingMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1965,22 +2847,29 @@ nvmlReturn_t nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceClearAccountingPids(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceClearAccountingPids) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isActive, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1989,11 +2878,17 @@ nvmlReturn_t 
nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nv rpc_read(0, isActive, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isActive, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int* version) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2002,11 +2897,18 @@ nvmlReturn_t nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, rpc_read(0, version, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkCapability) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2016,11 +2918,18 @@ nvmlReturn_t nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int lin rpc_read(0, capResult, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pci, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkRemotePciInfo_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2029,11 +2938,18 @@ nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned i rpc_read(0, pci, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pci, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, nvmlNvLinkErrorCounter_t counter, unsigned long long* counterValue) { + maybe_copy_unified_arg(0, (void*)&device, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)counterValue, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkErrorCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2043,11 +2959,17 @@ nvmlReturn_t nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int l rpc_read(0, counterValue, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)counterValue, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetNvLinkErrorCounters) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2055,11 +2977,18 @@ nvmlReturn_t nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned in rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t* control, unsigned int reset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)control, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetNvLinkUtilizationControl) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2070,11 +2999,20 @@ nvmlReturn_t nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)control, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t* control) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)control, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkUtilizationControl) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2084,11 +3022,20 @@ nvmlReturn_t 
nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned rpc_read(0, control, sizeof(nvmlNvLinkUtilizationControl_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)control, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, unsigned long long* rxcounter, unsigned long long* txcounter) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)rxcounter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)txcounter, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkUtilizationCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2099,11 +3046,20 @@ nvmlReturn_t nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned rpc_read(0, txcounter, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)rxcounter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)txcounter, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceFreezeNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlEnableState_t freeze) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&freeze, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceFreezeNvLinkUtilizationCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2113,11 +3069,18 @@ nvmlReturn_t nvmlDeviceFreezeNvLinkUtilizationCounter(nvmlDevice_t device, unsig rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&freeze, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetNvLinkUtilizationCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2126,11 +3089,17 @@ nvmlReturn_t nvmlDeviceResetNvLinkUtilizationCounter(nvmlDevice_t device, unsign rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; 
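+ // after the RPC completes, sync any unified-memory arguments back to the host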
+ maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t* pNvLinkDeviceType) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNvLinkDeviceType, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkRemoteDeviceType) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2139,22 +3108,30 @@ nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned i rpc_read(0, pNvLinkDeviceType, sizeof(nvmlIntNvLinkDeviceType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNvLinkDeviceType, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlEventSetCreate(nvmlEventSet_t* set) { + maybe_copy_unified_arg(0, (void*)set, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlEventSetCreate) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, set, sizeof(nvmlEventSet_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)set, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&eventTypes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceRegisterEvents) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2163,11 +3140,16 @@ nvmlReturn_t nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long ev rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&eventTypes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long* eventTypes) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)eventTypes, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedEventTypes) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2175,11 +3157,16 @@ nvmlReturn_t nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long rpc_read(0, eventTypes, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)eventTypes, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t* data, unsigned int timeoutms) { + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)data, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&timeoutms, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlEventSetWait_v2) < 0 || rpc_write(0, &set, sizeof(nvmlEventSet_t)) < 0 || @@ -2188,22 +3175,29 @@ nvmlReturn_t nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t* data, unsi rpc_read(0, data, sizeof(nvmlEventData_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&timeoutms, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlEventSetFree(nvmlEventSet_t set) { + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlEventSetFree) < 0 || rpc_write(0, &set, sizeof(nvmlEventSet_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceModifyDrainState(nvmlPciInfo_t* pciInfo, nvmlEnableState_t newState) { + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&newState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceModifyDrainState) < 0 || rpc_write(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || @@ -2212,11 +3206,15 @@ nvmlReturn_t nvmlDeviceModifyDrainState(nvmlPciInfo_t* pciInfo, nvmlEnableState_ rpc_read(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&newState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceQueryDrainState(nvmlPciInfo_t* pciInfo, nvmlEnableState_t* currentState) { + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)currentState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceQueryDrainState) < 0 || rpc_write(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || @@ -2225,11 +3223,16 @@ nvmlReturn_t nvmlDeviceQueryDrainState(nvmlPciInfo_t* pciInfo, nvmlEnableState_t rpc_read(0, currentState, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)currentState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t* pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState) { + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gpuState, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&linkState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceRemoveGpu_v2) < 0 || rpc_write(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || @@ -2239,11 +3242,15 @@ nvmlReturn_t nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t* pciInfo, nvmlDetachGpuState_t rpc_read(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gpuState, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&linkState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceDiscoverGpus(nvmlPciInfo_t* pciInfo) { + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceDiscoverGpus) < 0 || rpc_write(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || @@ -2251,11 +3258,17 @@ nvmlReturn_t nvmlDeviceDiscoverGpus(nvmlPciInfo_t* pciInfo) rpc_read(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&valuesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(valuesCount); i++) + maybe_copy_unified_arg(0, (void*)&values[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFieldValues) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2264,11 +3277,21 @@ nvmlReturn_t nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvml rpc_read(0, values, valuesCount * sizeof(nvmlFieldValue_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&valuesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(valuesCount); i++) + maybe_copy_unified_arg(0, (void*)&values[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&valuesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(valuesCount); i++) + maybe_copy_unified_arg(0, (void*)&values[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceClearFieldValues) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2277,11 +3300,18 @@ nvmlReturn_t nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nv rpc_read(0, values, valuesCount * sizeof(nvmlFieldValue_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&valuesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(valuesCount); i++) + maybe_copy_unified_arg(0, (void*)&values[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t* pVirtualMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pVirtualMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVirtualizationMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2289,11 +3319,15 @@ nvmlReturn_t nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtual rpc_read(0, 
pVirtualMode, sizeof(nvmlGpuVirtualizationMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pVirtualMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t* pHostVgpuMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pHostVgpuMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetHostVgpuMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2301,11 +3335,15 @@ nvmlReturn_t nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t* rpc_read(0, pHostVgpuMode, sizeof(nvmlHostVgpuMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pHostVgpuMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&virtualMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetVirtualizationMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2313,11 +3351,15 @@ nvmlReturn_t nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtual rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&virtualMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t* pGridLicensableFeatures) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pGridLicensableFeatures, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGridLicensableFeatures_v4) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2325,11 +3367,19 @@ nvmlReturn_t nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGri rpc_read(0, pGridLicensableFeatures, sizeof(nvmlGridLicensableFeatures_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pGridLicensableFeatures, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t* utilization, unsigned int* processSamplesCount, unsigned long long lastSeenTimeStamp) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)processSamplesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*processSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilization[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetProcessUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2340,11 +3390,19 @@ nvmlReturn_t 
nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUti rpc_read(0, utilization, *processSamplesCount * sizeof(nvmlProcessUtilizationSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)processSamplesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*processSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilization[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char* version) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGspFirmwareVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2352,11 +3410,16 @@ nvmlReturn_t nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char* version) rpc_read(0, version, sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int* isEnabled, unsigned int* defaultMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)defaultMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGspFirmwareMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2365,11 +3428,16 @@ nvmlReturn_t nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int* isE rpc_read(0, defaultMode, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)defaultMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability, unsigned int* capResult) { + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetVgpuDriverCapabilities) < 0 || rpc_write(0, &capability, sizeof(nvmlVgpuDriverCapability_t)) < 0 || @@ -2377,11 +3445,16 @@ nvmlReturn_t nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability rpc_read(0, capResult, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, unsigned int* capResult) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyHostToDevice); nvmlReturn_t 
return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuCapabilities) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2390,11 +3463,19 @@ nvmlReturn_t nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCa rpc_read(0, capResult, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int* vgpuCount, nvmlVgpuTypeId_t* vgpuTypeIds) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeIds, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeIds[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedVgpus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2404,11 +3485,21 @@ nvmlReturn_t nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int* vgpu rpc_read(0, vgpuTypeIds, *vgpuCount * sizeof(nvmlVgpuTypeId_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeIds, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeIds[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int* vgpuCount, nvmlVgpuTypeId_t* vgpuTypeIds) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeIds, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeIds[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCreatableVgpus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2418,11 +3509,21 @@ nvmlReturn_t nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int* vgpu rpc_read(0, vgpuTypeIds, *vgpuCount * sizeof(nvmlVgpuTypeId_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeIds, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeIds[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeClass, unsigned int* size) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeClass, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeClass[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetClass) 
< 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2431,11 +3532,21 @@ nvmlReturn_t nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeCla rpc_read(0, vgpuTypeClass, *size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeClass, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeClass[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeName, unsigned int* size) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeName, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeName[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetName) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2445,11 +3556,18 @@ nvmlReturn_t nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeName rpc_read(0, vgpuTypeName, *size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeName, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeName[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int* gpuInstanceProfileId) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstanceProfileId, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetGpuInstanceProfileId) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2457,11 +3575,16 @@ nvmlReturn_t nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, un rpc_read(0, gpuInstanceProfileId, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstanceProfileId, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long* deviceID, unsigned long long* subsystemID) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceID, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)subsystemID, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetDeviceID) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2470,11 +3593,16 @@ nvmlReturn_t nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long rpc_read(0, subsystemID, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceID, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)subsystemID, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long* fbSize) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fbSize, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetFramebufferSize) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2482,11 +3610,15 @@ nvmlReturn_t nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigne rpc_read(0, fbSize, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fbSize, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int* numDisplayHeads) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numDisplayHeads, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetNumDisplayHeads) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2494,11 +3626,17 @@ nvmlReturn_t nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigne rpc_read(0, numDisplayHeads, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numDisplayHeads, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int* xdim, unsigned int* ydim) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&displayIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)xdim, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)ydim, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetResolution) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2508,11 +3646,20 @@ nvmlReturn_t nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int rpc_read(0, ydim, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&displayIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)xdim, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)ydim, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeLicenseString, unsigned int size) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeLicenseString, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeLicenseString[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetLicense) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2521,11 +3668,18 @@ nvmlReturn_t nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeL 
rpc_read(0, vgpuTypeLicenseString, size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeLicenseString, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeLicenseString[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int* frameRateLimit) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)frameRateLimit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetFrameRateLimit) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2533,11 +3687,16 @@ nvmlReturn_t nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned rpc_read(0, frameRateLimit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)frameRateLimit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int* vgpuInstanceCount) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuInstanceCount, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetMaxInstances) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2546,11 +3705,16 @@ nvmlReturn_t nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t v rpc_read(0, vgpuInstanceCount, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuInstanceCount, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int* vgpuInstanceCountPerVm) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuInstanceCountPerVm, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetMaxInstancesPerVm) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2558,11 +3722,18 @@ nvmlReturn_t nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsig rpc_read(0, vgpuInstanceCountPerVm, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuInstanceCountPerVm, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int* vgpuCount, nvmlVgpuInstance_t* vgpuInstances) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuInstances, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*vgpuCount); i++) + 
maybe_copy_unified_arg(0, (void*)&vgpuInstances[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetActiveVgpus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2572,11 +3743,22 @@ nvmlReturn_t nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int* vgpuCou rpc_read(0, vgpuInstances, *vgpuCount * sizeof(nvmlVgpuInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuInstances, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuInstances[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char* vmId, unsigned int size, nvmlVgpuVmIdType_t* vmIdType) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vmId, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(size); i++) + maybe_copy_unified_arg(0, (void*)&vmId[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vmIdType, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetVmID) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2586,11 +3768,22 @@ nvmlReturn_t nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char* vmId rpc_read(0, vmIdType, sizeof(nvmlVgpuVmIdType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vmId, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(size); i++) + maybe_copy_unified_arg(0, (void*)&vmId[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vmIdType, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char* uuid, unsigned int size) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(size); i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetUUID) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2599,11 +3792,21 @@ nvmlReturn_t nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char* uuid rpc_read(0, uuid, size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(size); i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetVmDriverVersion) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2612,11 +3815,18 @@ nvmlReturn_t nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long* fbUsage) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fbUsage, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetFbUsage) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2624,11 +3834,15 @@ nvmlReturn_t nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigne rpc_read(0, fbUsage, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fbUsage, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int* licensed) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)licensed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetLicenseStatus) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2636,11 +3850,15 @@ nvmlReturn_t nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, u rpc_read(0, licensed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)licensed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t* vgpuTypeId) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeId, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetType) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2648,11 +3866,15 @@ nvmlReturn_t nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTy rpc_read(0, vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeId, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int* frameRateLimit) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)frameRateLimit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetFrameRateLimit) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2660,11 +3882,15 @@ nvmlReturn_t nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, frameRateLimit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)frameRateLimit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t* eccMode) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)eccMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetEccMode) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2672,11 +3898,15 @@ nvmlReturn_t nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEna rpc_read(0, eccMode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)eccMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int* encoderCapacity) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)encoderCapacity, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetEncoderCapacity) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2684,11 +3914,15 @@ nvmlReturn_t nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, encoderCapacity, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)encoderCapacity, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&encoderCapacity, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceSetEncoderCapacity) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2696,11 +3930,17 @@ nvmlReturn_t nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&encoderCapacity, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int* sessionCount, unsigned int* averageFps, unsigned int* averageLatency) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)averageFps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)averageLatency, 
cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetEncoderStats) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2710,11 +3950,20 @@ nvmlReturn_t nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, un rpc_read(0, averageLatency, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)averageFps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)averageLatency, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int* sessionCount, nvmlEncoderSessionInfo_t* sessionInfo) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetEncoderSessions) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2724,11 +3973,18 @@ nvmlReturn_t nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, sessionInfo, *sessionCount * sizeof(nvmlEncoderSessionInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t* fbcStats) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fbcStats, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetFBCStats) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2736,11 +3992,18 @@ nvmlReturn_t nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFB rpc_read(0, fbcStats, sizeof(nvmlFBCStats_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fbcStats, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int* sessionCount, nvmlFBCSessionInfo_t* sessionInfo) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetFBCSessions) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2750,11 +4013,18 @@ nvmlReturn_t 
nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, uns rpc_read(0, sessionInfo, *sessionCount * sizeof(nvmlFBCSessionInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, unsigned int* gpuInstanceId) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstanceId, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetGpuInstanceId) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2762,11 +4032,18 @@ nvmlReturn_t nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, u rpc_read(0, gpuInstanceId, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstanceId, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char* vgpuPciId, unsigned int* length) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuPciId, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*length); i++) + maybe_copy_unified_arg(0, (void*)&vgpuPciId[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetGpuPciId) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2776,11 +4053,19 @@ nvmlReturn_t nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char* rpc_read(0, vgpuPciId, *length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuPciId, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*length); i++) + maybe_copy_unified_arg(0, (void*)&vgpuPciId[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int* capResult) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetCapabilities) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2789,11 +4074,19 @@ nvmlReturn_t nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCa rpc_read(0, capResult, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)capResult, 
cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t* vgpuMetadata, unsigned int* bufferSize) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuMetadata, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&vgpuMetadata[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetMetadata) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2803,11 +4096,21 @@ nvmlReturn_t nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVg rpc_read(0, vgpuMetadata, *bufferSize * sizeof(nvmlVgpuMetadata_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuMetadata, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&vgpuMetadata[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t* pgpuMetadata, unsigned int* bufferSize) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&pgpuMetadata[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuMetadata) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2817,11 +4120,19 @@ nvmlReturn_t nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata rpc_read(0, pgpuMetadata, *bufferSize * sizeof(nvmlVgpuPgpuMetadata_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&pgpuMetadata[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t* vgpuMetadata, nvmlVgpuPgpuMetadata_t* pgpuMetadata, nvmlVgpuPgpuCompatibility_t* compatibilityInfo) { + maybe_copy_unified_arg(0, (void*)vgpuMetadata, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)compatibilityInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetVgpuCompatibility) < 0 || rpc_write(0, vgpuMetadata, sizeof(nvmlVgpuMetadata_t)) < 0 || @@ -2831,11 +4142,19 @@ nvmlReturn_t nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t* vgpuMetadata, nvmlVgpu rpc_read(0, compatibilityInfo, sizeof(nvmlVgpuPgpuCompatibility_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)vgpuMetadata, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)compatibilityInfo, 
cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char* pgpuMetadata, unsigned int* bufferSize) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&pgpuMetadata[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPgpuMetadataString) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2845,11 +4164,18 @@ nvmlReturn_t nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char* pgpuMeta rpc_read(0, pgpuMetadata, *bufferSize * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&pgpuMetadata[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedulerLog_t* pSchedulerLog) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pSchedulerLog, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuSchedulerLog) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2857,11 +4183,15 @@ nvmlReturn_t nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedule rpc_read(0, pSchedulerLog, sizeof(nvmlVgpuSchedulerLog_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pSchedulerLog, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerGetState_t* pSchedulerState) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pSchedulerState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuSchedulerState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2869,11 +4199,15 @@ nvmlReturn_t nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedu rpc_read(0, pSchedulerState, sizeof(nvmlVgpuSchedulerGetState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pSchedulerState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgpuSchedulerCapabilities_t* pCapabilities) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pCapabilities, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuSchedulerCapabilities) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2881,11 +4215,15 @@ nvmlReturn_t nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgp rpc_read(0, pCapabilities, sizeof(nvmlVgpuSchedulerCapabilities_t)) < 0 || rpc_end_response(0, &return_value) < 0) return 
NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pCapabilities, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetVgpuVersion(nvmlVgpuVersion_t* supported, nvmlVgpuVersion_t* current) { + maybe_copy_unified_arg(0, (void*)supported, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetVgpuVersion) < 0 || rpc_wait_for_response(0) < 0 || @@ -2893,22 +4231,33 @@ nvmlReturn_t nvmlGetVgpuVersion(nvmlVgpuVersion_t* supported, nvmlVgpuVersion_t* rpc_read(0, current, sizeof(nvmlVgpuVersion_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)supported, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSetVgpuVersion(nvmlVgpuVersion_t* vgpuVersion) { + maybe_copy_unified_arg(0, (void*)vgpuVersion, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSetVgpuVersion) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, vgpuVersion, sizeof(nvmlVgpuVersion_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)vgpuVersion, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, nvmlValueType_t* sampleValType, unsigned int* vgpuInstanceSamplesCount, nvmlVgpuInstanceUtilizationSample_t* utilizationSamples) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sampleValType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuInstanceSamplesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilizationSamples, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuInstanceSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilizationSamples[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2921,11 +4270,24 @@ nvmlReturn_t nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long lon rpc_read(0, utilizationSamples, *vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sampleValType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuInstanceSamplesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilizationSamples, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuInstanceSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilizationSamples[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, unsigned int* vgpuProcessSamplesCount, nvmlVgpuProcessUtilizationSample_t* utilizationSamples) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)vgpuProcessSamplesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilizationSamples, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuProcessSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilizationSamples[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuProcessUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2936,11 +4298,19 @@ nvmlReturn_t nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned l rpc_read(0, utilizationSamples, *vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuProcessSamplesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilizationSamples, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuProcessSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilizationSamples[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t* mode) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetAccountingMode) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2948,11 +4318,18 @@ nvmlReturn_t nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, mode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int* count, unsigned int* pids) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pids, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&pids[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetAccountingPids) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2962,11 +4339,19 @@ nvmlReturn_t nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, pids, *count * sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pids, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&pids[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t* stats) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)stats, 
cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetAccountingStats) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2975,22 +4360,29 @@ nvmlReturn_t nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, stats, sizeof(nvmlAccountingStats_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)stats, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceClearAccountingPids) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t* licenseInfo) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)licenseInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetLicenseInfo_v2) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2998,22 +4390,28 @@ nvmlReturn_t nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, licenseInfo, sizeof(nvmlVgpuLicenseInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)licenseInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetExcludedDeviceCount(unsigned int* deviceCount) { + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetExcludedDeviceCount) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, deviceCount, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDeviceInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetExcludedDeviceInfoByIndex) < 0 || rpc_write(0, &index, sizeof(unsigned int)) < 0 || @@ -3021,11 +4419,16 @@ nvmlReturn_t nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDe rpc_read(0, info, sizeof(nvmlExcludedDeviceInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t* activationStatus) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)activationStatus, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetMigMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3034,11 +4437,17 @@ nvmlReturn_t nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlRe rpc_read(0, activationStatus, sizeof(nvmlReturn_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)activationStatus, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int* currentMode, unsigned int* pendingMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)currentMode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pendingMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMigMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3047,11 +4456,17 @@ nvmlReturn_t nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int* currentMode rpc_read(0, pendingMode, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)currentMode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pendingMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, nvmlGpuInstanceProfileInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceProfileInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3060,11 +4475,17 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned i rpc_read(0, info, sizeof(nvmlGpuInstanceProfileInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, nvmlGpuInstanceProfileInfo_v2_t* info) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceProfileInfoV) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3073,11 +4494,20 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned rpc_read(0, info, sizeof(nvmlGpuInstanceProfileInfo_v2_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t 
nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t* placements, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)placements, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&placements[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstancePossiblePlacements_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3088,11 +4518,20 @@ nvmlReturn_t nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, rpc_read(0, placements, *count * sizeof(nvmlGpuInstancePlacement_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)placements, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&placements[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceRemainingCapacity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3101,11 +4540,17 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsi rpc_read(0, count, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstance_t* gpuInstance) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceCreateGpuInstance) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3114,22 +4559,33 @@ nvmlReturn_t nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profi rpc_read(0, gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceDestroy) < 0 || rpc_write(0, &gpuInstance, 
sizeof(nvmlGpuInstance_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstance_t* gpuInstances, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstances, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&gpuInstances[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstances) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3140,11 +4596,20 @@ nvmlReturn_t nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profile rpc_read(0, gpuInstances, *count * sizeof(nvmlGpuInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstances, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&gpuInstances[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t* gpuInstance) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&id, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceById) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3153,11 +4618,16 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, rpc_read(0, gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&id, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetInfo) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3165,11 +4635,17 @@ nvmlReturn_t nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstan rpc_read(0, info, sizeof(nvmlGpuInstanceInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, unsigned int engProfile, nvmlComputeInstanceProfileInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&engProfile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstanceProfileInfo) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3179,11 +4655,19 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuI rpc_read(0, info, sizeof(nvmlComputeInstanceProfileInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&engProfile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile, unsigned int engProfile, nvmlComputeInstanceProfileInfo_v2_t* info) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&engProfile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstanceProfileInfoV) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3193,11 +4677,18 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpu rpc_read(0, info, sizeof(nvmlComputeInstanceProfileInfo_v2_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&engProfile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, unsigned int profileId, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstanceRemainingCapacity) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3206,11 +4697,20 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_ rpc_read(0, count, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance_t gpuInstance, unsigned int profileId, nvmlComputeInstancePlacement_t* placements, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)placements, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&placements[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstancePossiblePlacements) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3221,11 +4721,20 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance rpc_read(0, placements, *count * sizeof(nvmlComputeInstancePlacement_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)placements, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&placements[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, nvmlComputeInstance_t* computeInstance) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)computeInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceCreateComputeInstance) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3234,22 +4743,33 @@ nvmlReturn_t nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, rpc_read(0, computeInstance, sizeof(nvmlComputeInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)computeInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance) { + maybe_copy_unified_arg(0, (void*)&computeInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlComputeInstanceDestroy) < 0 || rpc_write(0, &computeInstance, sizeof(nvmlComputeInstance_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&computeInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, nvmlComputeInstance_t* computeInstances, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)computeInstances, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&computeInstances[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstances) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3260,11 +4780,20 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, u rpc_read(0, computeInstances, *count * sizeof(nvmlComputeInstance_t)) < 0 || rpc_end_response(0, 
&return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)computeInstances, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&computeInstances[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, nvmlComputeInstance_t* computeInstance) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&id, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)computeInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstanceById) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3273,11 +4802,16 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance rpc_read(0, computeInstance, sizeof(nvmlComputeInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&id, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)computeInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&computeInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlComputeInstanceGetInfo_v2) < 0 || rpc_write(0, &computeInstance, sizeof(nvmlComputeInstance_t)) < 0 || @@ -3285,11 +4819,15 @@ nvmlReturn_t nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance rpc_read(0, info, sizeof(nvmlComputeInstanceInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&computeInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int* isMigDevice) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isMigDevice, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceIsMigDeviceHandle) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3297,11 +4835,15 @@ nvmlReturn_t nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int* isMi rpc_read(0, isMigDevice, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isMigDevice, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int* id) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)id, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceId) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3309,11 +4851,15 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int* id) 
rpc_read(0, id, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)id, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int* id) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)id, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetComputeInstanceId) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3321,11 +4867,15 @@ nvmlReturn_t nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int* i rpc_read(0, id, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)id, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxMigDeviceCount) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3333,11 +4883,16 @@ nvmlReturn_t nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int* c rpc_read(0, count, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, nvmlDevice_t* migDevice) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)migDevice, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMigDeviceHandleByIndex) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3346,11 +4901,16 @@ nvmlReturn_t nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned i rpc_read(0, migDevice, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)migDevice, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)&migDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDeviceHandleFromMigDeviceHandle) < 0 || rpc_write(0, &migDevice, sizeof(nvmlDevice_t)) < 0 || @@ -3358,11 +4918,15 @@ nvmlReturn_t nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&migDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBusType(nvmlDevice_t device, 
nvmlBusType_t* type) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBusType) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3370,11 +4934,15 @@ nvmlReturn_t nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t* type) rpc_read(0, type, sizeof(nvmlBusType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t* pDynamicPstatesInfo) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDynamicPstatesInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDynamicPstatesInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3382,11 +4950,16 @@ nvmlReturn_t nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamic rpc_read(0, pDynamicPstatesInfo, sizeof(nvmlGpuDynamicPstatesInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDynamicPstatesInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&speed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetFanSpeed_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3395,11 +4968,16 @@ nvmlReturn_t nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, uns rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&speed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int* offset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)offset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpcClkVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3407,11 +4985,15 @@ nvmlReturn_t nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int* offset) rpc_read(0, offset, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)offset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetGpcClkVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3419,11 +5001,15 @@ nvmlReturn_t nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t 
device, int offset) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int* offset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)offset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemClkVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3431,11 +5017,15 @@ nvmlReturn_t nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int* offset) rpc_read(0, offset, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)offset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetMemClkVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3443,11 +5033,18 @@ nvmlReturn_t nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, unsigned int* minClockMHz, unsigned int* maxClockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pstate, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxClockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMinMaxClockOfPState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3458,11 +5055,21 @@ nvmlReturn_t nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType rpc_read(0, maxClockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pstate, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxClockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, nvmlPstates_t* pstates, unsigned int size) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pstates, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(size); i++) + maybe_copy_unified_arg(0, (void*)&pstates[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedPerformanceStates) < 0 ||
rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3471,11 +5078,19 @@ nvmlReturn_t nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, nvmlPs rpc_read(0, pstates, size * sizeof(nvmlPstates_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pstates, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(size); i++) + maybe_copy_unified_arg(0, (void*)&pstates[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, int* minOffset, int* maxOffset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxOffset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpcClkMinMaxVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3484,11 +5099,17 @@ nvmlReturn_t nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, int* minOffs rpc_read(0, maxOffset, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxOffset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, int* minOffset, int* maxOffset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxOffset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemClkMinMaxVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3497,11 +5118,16 @@ nvmlReturn_t nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, int* minOffs rpc_read(0, maxOffset, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxOffset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuFabricInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuFabricInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3509,44 +5135,54 @@ nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t rpc_read(0, gpuFabricInfo, sizeof(nvmlGpuFabricInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuFabricInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmMetricsGet(nvmlGpmMetricsGet_t* metricsGet) { + maybe_copy_unified_arg(0, (void*)metricsGet, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmMetricsGet) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, metricsGet,
sizeof(nvmlGpmMetricsGet_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)metricsGet, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmSampleFree(nvmlGpmSample_t gpmSample) { + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmSampleFree) < 0 || rpc_write(0, &gpmSample, sizeof(nvmlGpmSample_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmSampleAlloc(nvmlGpmSample_t* gpmSample) { + maybe_copy_unified_arg(0, (void*)gpmSample, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmSampleAlloc) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, gpmSample, sizeof(nvmlGpmSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)gpmSample, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmSampleGet) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3554,11 +5190,16 @@ nvmlReturn_t nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gpuInstanceId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmMigSampleGet) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3567,11 +5208,16 @@ nvmlReturn_t nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gpuInstanceId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t* gpmSupport) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpmSupport, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmQueryDeviceSupport) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3579,11 +5225,15 @@ nvmlReturn_t nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t* gp rpc_read(0, gpmSupport, sizeof(nvmlGpmSupport_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpmSupport, cudaMemcpyDeviceToHost); return return_value; } 
nvmlReturn_t nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvmlNvLinkPowerThres_t* info) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetNvLinkDeviceLowPowerThreshold) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3591,33 +5241,41 @@ nvmlReturn_t nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvm rpc_read(0, info, sizeof(nvmlNvLinkPowerThres_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } CUresult cuInit(unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuInit) < 0 || rpc_write(0, &Flags, sizeof(unsigned int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDriverGetVersion(int* driverVersion) { + maybe_copy_unified_arg(0, (void*)driverVersion, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDriverGetVersion) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, driverVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)driverVersion, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGet(CUdevice* device, int ordinal) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ordinal, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGet) < 0 || rpc_write(0, &ordinal, sizeof(int)) < 0 || @@ -3625,22 +5283,31 @@ CUresult cuDeviceGet(CUdevice* device, int ordinal) rpc_read(0, device, sizeof(CUdevice)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ordinal, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetCount(int* count) { + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetCount) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, count, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetName(char* name, int len, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(len); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetName) < 0 || rpc_write(0, &len, sizeof(int)) < 0 || @@ -3649,11 +5316,20 @@ CUresult cuDeviceGetName(char* name, int len, CUdevice dev) rpc_read(0, name, len * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyDeviceToHost); +
maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(len); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + for (int i = 0; i < 16; i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetUuid) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3661,11 +5337,19 @@ CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) rpc_read(0, uuid, 16) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < 16; i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetUuid_v2(CUuuid* uuid, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + for (int i = 0; i < 16; i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetUuid_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3673,11 +5357,18 @@ CUresult cuDeviceGetUuid_v2(CUuuid* uuid, CUdevice dev) rpc_read(0, uuid, 16) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < 16; i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetLuid(char* luid, unsigned int* deviceNodeMask, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)luid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceNodeMask, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; std::size_t luid_len; if (rpc_start_request(0, RPC_cuDeviceGetLuid) < 0 || @@ -3688,11 +5379,16 @@ CUresult cuDeviceGetLuid(char* luid, unsigned int* deviceNodeMask, CUdevice dev) rpc_read(0, deviceNodeMask, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)luid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceNodeMask, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceTotalMem_v2(size_t* bytes, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceTotalMem_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3700,11 +5396,17 @@ CUresult cuDeviceTotalMem_v2(size_t* bytes, CUdevice dev) rpc_read(0, bytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult
cuDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)maxWidthInElements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numChannels, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetTexture1DLinearMaxWidth) < 0 || rpc_write(0, &format, sizeof(CUarray_format)) < 0 || @@ -3714,11 +5416,18 @@ CUresult cuDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, CUarray_ rpc_read(0, maxWidthInElements, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)maxWidthInElements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numChannels, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetAttribute) < 0 || rpc_write(0, &attrib, sizeof(CUdevice_attribute)) < 0 || @@ -3727,11 +5436,16 @@ CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) rpc_read(0, pi, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceSetMemPool) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3739,11 +5453,15 @@ CUresult cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetMemPool(CUmemoryPool* pool, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetMemPool) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3751,11 +5469,15 @@ CUresult cuDeviceGetMemPool(CUmemoryPool* pool, CUdevice dev) rpc_read(0, pool, sizeof(CUmemoryPool)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetDefaultMemPool(CUmemoryPool* pool_out, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pool_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuDeviceGetDefaultMemPool) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3763,11 +5485,16 @@ CUresult cuDeviceGetDefaultMemPool(CUmemoryPool* pool_out, CUdevice dev) rpc_read(0, pool_out, sizeof(CUmemoryPool)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pool_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetExecAffinitySupport(int* pi, CUexecAffinityType type, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetExecAffinitySupport) < 0 || rpc_write(0, &type, sizeof(CUexecAffinityType)) < 0 || @@ -3776,11 +5503,16 @@ CUresult cuDeviceGetExecAffinitySupport(int* pi, CUexecAffinityType type, CUdevi rpc_read(0, pi, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope) { + maybe_copy_unified_arg(0, (void*)&target, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFlushGPUDirectRDMAWrites) < 0 || rpc_write(0, &target, sizeof(CUflushGPUDirectRDMAWritesTarget)) < 0 || @@ -3788,11 +5520,15 @@ CUresult cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUf rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&target, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetProperties) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3800,11 +5536,16 @@ CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev) rpc_read(0, prop, sizeof(CUdevprop)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)major, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceComputeCapability) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3813,11 +5554,16 @@ CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev) rpc_read(0, minor, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)major, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)minor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxRetain(CUcontext* pctx, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxRetain) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3825,22 +5571,28 @@ CUresult cuDevicePrimaryCtxRetain(CUcontext* pctx, CUdevice dev) rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxRelease_v2(CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxRelease_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxSetFlags_v2(CUdevice dev, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxSetFlags_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3848,11 +5600,16 @@ CUresult cuDevicePrimaryCtxSetFlags_v2(CUdevice dev, unsigned int flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* active) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)active, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxGetState) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3861,22 +5618,30 @@ CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* acti rpc_read(0, active, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)active, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxReset_v2(CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxReset_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxCreate_v2) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || @@ -3885,11 +5650,21 @@ CUresult cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxCreate_v3(CUcontext* pctx, CUexecAffinityParam* paramsArray, int numParams, unsigned int flags, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numParams); i++) + maybe_copy_unified_arg(0, (void*)&paramsArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxCreate_v3) < 0 || rpc_write(0, &numParams, sizeof(int)) < 0 || @@ -3900,88 +5675,111 @@ CUresult cuCtxCreate_v3(CUcontext* pctx, CUexecAffinityParam* paramsArray, int n rpc_read(0, paramsArray, numParams * sizeof(CUexecAffinityParam)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numParams); i++) + maybe_copy_unified_arg(0, (void*)&paramsArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxDestroy_v2(CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxDestroy_v2) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxPushCurrent_v2(CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxPushCurrent_v2) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxPopCurrent_v2(CUcontext* pctx) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxPopCurrent_v2) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxSetCurrent(CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0,
RPC_cuCtxSetCurrent) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetCurrent(CUcontext* pctx) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetCurrent) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetDevice(CUdevice* device) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetDevice) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, device, sizeof(CUdevice)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetFlags(unsigned int* flags) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetFlags) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetId(CUcontext ctx, unsigned long long* ctxId) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)ctxId, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetId) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || @@ -3989,6 +5787,8 @@ CUresult cuCtxGetId(CUcontext ctx, unsigned long long* ctxId) rpc_read(0, ctxId, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)ctxId, cudaMemcpyDeviceToHost); return return_value; } @@ -4004,6 +5804,8 @@ CUresult cuCtxSynchronize() CUresult cuCtxSetLimit(CUlimit limit, size_t value) { + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxSetLimit) < 0 || rpc_write(0, &limit, sizeof(CUlimit)) < 0 || @@ -4011,11 +5813,15 @@ CUresult cuCtxSetLimit(CUlimit limit, size_t value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetLimit(size_t* pvalue, CUlimit limit) { + maybe_copy_unified_arg(0, (void*)pvalue, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetLimit) < 0 || rpc_write(0, &limit, sizeof(CUlimit)) < 0 || @@ -4023,55 +5829,67 @@ CUresult cuCtxGetLimit(size_t* pvalue, CUlimit limit) rpc_read(0, pvalue, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pvalue, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetCacheConfig(CUfunc_cache* pconfig) { + maybe_copy_unified_arg(0, (void*)pconfig, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetCacheConfig) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, pconfig, sizeof(CUfunc_cache)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pconfig, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxSetCacheConfig(CUfunc_cache config) { + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxSetCacheConfig) < 0 || rpc_write(0, &config, sizeof(CUfunc_cache)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetSharedMemConfig(CUsharedconfig* pConfig) { + maybe_copy_unified_arg(0, (void*)pConfig, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetSharedMemConfig) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, pConfig, sizeof(CUsharedconfig)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pConfig, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxSetSharedMemConfig(CUsharedconfig config) { + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxSetSharedMemConfig) < 0 || rpc_write(0, &config, sizeof(CUsharedconfig)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetApiVersion(CUcontext ctx, unsigned int* version) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetApiVersion) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || @@ -4079,11 +5897,15 @@ CUresult cuCtxGetApiVersion(CUcontext ctx, unsigned int* version) rpc_read(0, version, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetStreamPriorityRange(int* leastPriority, int* greatestPriority) { + maybe_copy_unified_arg(0, (void*)leastPriority, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)greatestPriority, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetStreamPriorityRange) < 0 || rpc_wait_for_response(0) < 0 || @@ -4091,6 +5913,8 @@ CUresult cuCtxGetStreamPriorityRange(int* leastPriority, int* greatestPriority) rpc_read(0, greatestPriority, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)leastPriority, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)greatestPriority, cudaMemcpyDeviceToHost); return return_value; } @@ -4106,6 +5930,8 @@ CUresult cuCtxResetPersistingL2Cache() CUresult cuCtxGetExecAffinity(CUexecAffinityParam* pExecAffinity, CUexecAffinityType type) { + 
maybe_copy_unified_arg(0, (void*)pExecAffinity, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetExecAffinity) < 0 || rpc_write(0, &type, sizeof(CUexecAffinityType)) < 0 || @@ -4113,11 +5939,15 @@ CUresult cuCtxGetExecAffinity(CUexecAffinityParam* pExecAffinity, CUexecAffinity rpc_read(0, pExecAffinity, sizeof(CUexecAffinityParam)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pExecAffinity, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxAttach(CUcontext* pctx, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxAttach) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || @@ -4125,22 +5955,28 @@ CUresult cuCtxAttach(CUcontext* pctx, unsigned int flags) rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxDetach(CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxDetach) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleLoad(CUmodule* module, const char* fname) { + maybe_copy_unified_arg(0, (void*)module, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fname, cudaMemcpyHostToDevice); CUresult return_value; std::size_t fname_len = std::strlen(fname) + 1; if (rpc_start_request(0, RPC_cuModuleLoad) < 0 || @@ -4150,22 +5986,27 @@ CUresult cuModuleLoad(CUmodule* module, const char* fname) rpc_read(0, module, sizeof(CUmodule)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)module, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fname, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleUnload(CUmodule hmod) { + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuModuleUnload) < 0 || rpc_write(0, &hmod, sizeof(CUmodule)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetLoadingMode(CUmoduleLoadingMode* mode) { + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuModuleGetLoadingMode) < 0 || rpc_write(0, mode, sizeof(CUmoduleLoadingMode)) < 0 || @@ -4173,11 +6014,15 @@ CUresult cuModuleGetLoadingMode(CUmoduleLoadingMode* mode) rpc_read(0, mode, sizeof(CUmoduleLoadingMode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) { + 
maybe_copy_unified_arg(0, (void*)hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuModuleGetFunction) < 0 || @@ -4188,11 +6033,18 @@ CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) rpc_read(0, hfunc, sizeof(CUfunction)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetGlobal_v2(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuModuleGetGlobal_v2) < 0 || @@ -4204,11 +6056,19 @@ CUresult cuModuleGetGlobal_v2(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, c rpc_read(0, bytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLinkCreate_v2(unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) { + maybe_copy_unified_arg(0, (void*)&numOptions, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)options, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)optionValues, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)stateOut, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLinkCreate_v2) < 0 || rpc_write(0, &numOptions, sizeof(unsigned int)) < 0 || @@ -4221,11 +6081,25 @@ CUresult cuLinkCreate_v2(unsigned int numOptions, CUjit_option* options, void** rpc_read(0, stateOut, sizeof(CUlinkState)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&numOptions, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)options, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)optionValues, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)stateOut, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions, CUjit_option* options, void** optionValues) { + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numOptions, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)options, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numOptions); i++) + maybe_copy_unified_arg(0, (void*)&options[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)optionValues,
cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numOptions); i++) + maybe_copy_unified_arg(0, (void*)&optionValues[i], cudaMemcpyHostToDevice); CUresult return_value; std::size_t path_len = std::strlen(path) + 1; if (rpc_start_request(0, RPC_cuLinkAddFile_v2) < 0 || @@ -4239,11 +6113,24 @@ CUresult cuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char* pa rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numOptions, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)options, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numOptions); i++) + maybe_copy_unified_arg(0, (void*)&options[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)optionValues, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numOptions); i++) + maybe_copy_unified_arg(0, (void*)&optionValues[i], cudaMemcpyDeviceToHost); return return_value; } CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) { + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)cubinOut, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeOut, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLinkComplete) < 0 || rpc_write(0, &state, sizeof(CUlinkState)) < 0 || @@ -4252,22 +6139,30 @@ CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) rpc_read(0, sizeOut, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)cubinOut, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeOut, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLinkDestroy(CUlinkState state) { + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLinkDestroy) < 0 || rpc_write(0, &state, sizeof(CUlinkState)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetTexRef(CUtexref* pTexRef, CUmodule hmod, const char* name) { + maybe_copy_unified_arg(0, (void*)pTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuModuleGetTexRef) < 0 || @@ -4278,11 +6173,17 @@ CUresult cuModuleGetTexRef(CUtexref* pTexRef, CUmodule hmod, const char* name) rpc_read(0, pTexRef, sizeof(CUtexref)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetSurfRef(CUsurfref* pSurfRef, CUmodule hmod, const char* name) { + maybe_copy_unified_arg(0, (void*)pSurfRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hmod,
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuModuleGetSurfRef) < 0 || @@ -4293,11 +6194,30 @@ CUresult cuModuleGetSurfRef(CUsurfref* pSurfRef, CUmodule hmod, const char* name rpc_read(0, pSurfRef, sizeof(CUsurfref)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pSurfRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryLoadFromFile(CUlibrary* library, const char* fileName, CUjit_option* jitOptions, void** jitOptionsValues, unsigned int numJitOptions, CUlibraryOption* libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions) { + maybe_copy_unified_arg(0, (void*)library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fileName, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numJitOptions, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)jitOptions, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numJitOptions); i++) + maybe_copy_unified_arg(0, (void*)&jitOptions[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)jitOptionsValues, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numJitOptions); i++) + maybe_copy_unified_arg(0, (void*)&jitOptionsValues[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numLibraryOptions, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)libraryOptions, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numLibraryOptions); i++) + maybe_copy_unified_arg(0, (void*)&libraryOptions[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)libraryOptionValues, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numLibraryOptions); i++) + maybe_copy_unified_arg(0, (void*)&libraryOptionValues[i], cudaMemcpyHostToDevice); CUresult return_value; std::size_t fileName_len = std::strlen(fileName) + 1; if (rpc_start_request(0, RPC_cuLibraryLoadFromFile) < 0 || @@ -4313,22 +6233,43 @@ CUresult cuLibraryLoadFromFile(CUlibrary* library, const char* fileName, CUjit_o rpc_read(0, library, sizeof(CUlibrary)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fileName, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numJitOptions, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)jitOptions, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numJitOptions); i++) + maybe_copy_unified_arg(0, (void*)&jitOptions[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)jitOptionsValues, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numJitOptions); i++) + maybe_copy_unified_arg(0, (void*)&jitOptionsValues[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numLibraryOptions, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)libraryOptions, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numLibraryOptions); i++) + maybe_copy_unified_arg(0, (void*)&libraryOptions[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)libraryOptionValues, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numLibraryOptions); i++) + maybe_copy_unified_arg(0,
(void*)&libraryOptionValues[i], cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryUnload(CUlibrary library) { + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLibraryUnload) < 0 || rpc_write(0, &library, sizeof(CUlibrary)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetKernel(CUkernel* pKernel, CUlibrary library, const char* name) { + maybe_copy_unified_arg(0, (void*)pKernel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuLibraryGetKernel) < 0 || @@ -4339,11 +6280,16 @@ CUresult cuLibraryGetKernel(CUkernel* pKernel, CUlibrary library, const char* na rpc_read(0, pKernel, sizeof(CUkernel)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pKernel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetModule(CUmodule* pMod, CUlibrary library) { + maybe_copy_unified_arg(0, (void*)pMod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLibraryGetModule) < 0 || rpc_write(0, &library, sizeof(CUlibrary)) < 0 || @@ -4351,11 +6297,15 @@ CUresult cuLibraryGetModule(CUmodule* pMod, CUlibrary library) rpc_read(0, pMod, sizeof(CUmodule)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pMod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); return return_value; } CUresult cuKernelGetFunction(CUfunction* pFunc, CUkernel kernel) { + maybe_copy_unified_arg(0, (void*)pFunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuKernelGetFunction) < 0 || rpc_write(0, &kernel, sizeof(CUkernel)) < 0 || @@ -4363,11 +6313,17 @@ CUresult cuKernelGetFunction(CUfunction* pFunc, CUkernel kernel) rpc_read(0, pFunc, sizeof(CUfunction)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pFunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUlibrary library, const char* name) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuLibraryGetGlobal) < 0 || @@ -4379,11 +6335,19 @@ CUresult cuLibraryGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUlibrary library, rpc_read(0, bytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, 
(void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetManaged(CUdeviceptr* dptr, size_t* bytes, CUlibrary library, const char* name) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuLibraryGetManaged) < 0 || @@ -4395,11 +6359,18 @@ CUresult cuLibraryGetManaged(CUdeviceptr* dptr, size_t* bytes, CUlibrary library rpc_read(0, bytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetUnifiedFunction(void** fptr, CUlibrary library, const char* symbol) { + maybe_copy_unified_arg(0, (void*)fptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); CUresult return_value; std::size_t symbol_len = std::strlen(symbol) + 1; if (rpc_start_request(0, RPC_cuLibraryGetUnifiedFunction) < 0 || @@ -4410,11 +6381,18 @@ CUresult cuLibraryGetUnifiedFunction(void** fptr, CUlibrary library, const char* rpc_read(0, fptr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)fptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); return return_value; } CUresult cuKernelGetAttribute(int* pi, CUfunction_attribute attrib, CUkernel kernel, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuKernelGetAttribute) < 0 || rpc_write(0, pi, sizeof(int)) < 0 || @@ -4425,11 +6403,19 @@ CUresult cuKernelGetAttribute(int* pi, CUfunction_attribute attrib, CUkernel ker rpc_read(0, pi, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuKernelSetAttribute(CUfunction_attribute attrib, int val, CUkernel kernel, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&val, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if 
(rpc_start_request(0, RPC_cuKernelSetAttribute) < 0 || rpc_write(0, &attrib, sizeof(CUfunction_attribute)) < 0 || @@ -4439,11 +6425,18 @@ CUresult cuKernelSetAttribute(CUfunction_attribute attrib, int val, CUkernel ker rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&val, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuKernelSetCacheConfig(CUkernel kernel, CUfunc_cache config, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuKernelSetCacheConfig) < 0 || rpc_write(0, &kernel, sizeof(CUkernel)) < 0 || @@ -4452,11 +6445,16 @@ CUresult cuKernelSetCacheConfig(CUkernel kernel, CUfunc_cache config, CUdevice d rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetInfo_v2(size_t* free, size_t* total) { + maybe_copy_unified_arg(0, (void*)free, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)total, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetInfo_v2) < 0 || rpc_write(0, free, sizeof(size_t)) < 0 || @@ -4466,11 +6464,15 @@ CUresult cuMemGetInfo_v2(size_t* free, size_t* total) rpc_read(0, total, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)free, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)total, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAlloc_v2(CUdeviceptr* dptr, size_t bytesize) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAlloc_v2) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -4479,11 +6481,18 @@ CUresult cuMemAlloc_v2(CUdeviceptr* dptr, size_t bytesize) rpc_read(0, dptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocPitch_v2(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&WidthInBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ElementSizeBytes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocPitch_v2) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -4496,22 +6505,32 @@ CUresult cuMemAllocPitch_v2(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInByt 
rpc_read(0, pPitch, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&WidthInBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ElementSizeBytes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemFree_v2(CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemFree_v2) < 0 || rpc_write(0, &dptr, sizeof(CUdeviceptr)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetAddressRange_v2(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)pbase, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)psize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetAddressRange_v2) < 0 || rpc_write(0, pbase, sizeof(CUdeviceptr)) < 0 || @@ -4522,11 +6541,16 @@ CUresult cuMemGetAddressRange_v2(CUdeviceptr* pbase, size_t* psize, CUdeviceptr rpc_read(0, psize, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pbase, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)psize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocHost_v2(void** pp, size_t bytesize) { + maybe_copy_unified_arg(0, (void*)pp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocHost_v2) < 0 || rpc_write(0, &bytesize, sizeof(size_t)) < 0 || @@ -4534,22 +6558,29 @@ CUresult cuMemAllocHost_v2(void** pp, size_t bytesize) rpc_read(0, pp, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemFreeHost(void* p) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemFreeHost) < 0 || rpc_write(0, &p, sizeof(void*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)pp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemHostAlloc) < 0 || rpc_write(0, &bytesize, sizeof(size_t)) < 0 || @@ -4558,11 +6589,17 @@ CUresult cuMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags) rpc_read(0, pp, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pp, cudaMemcpyDeviceToHost); + 
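// Note on the generated stubs in this hunk: every wrapper routes each of its arguments
// through maybe_copy_unified_arg twice, once with cudaMemcpyHostToDevice before the RPC
// request is written and once with cudaMemcpyDeviceToHost after the response has been
// read, so any argument that refers to unified memory is synchronized on both sides of
// the remote call.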
maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemHostGetDevicePointer_v2(CUdeviceptr* pdptr, void* p, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemHostGetDevicePointer_v2) < 0 || rpc_write(0, pdptr, sizeof(CUdeviceptr)) < 0 || @@ -4572,11 +6609,16 @@ CUresult cuMemHostGetDevicePointer_v2(CUdeviceptr* pdptr, void* p, unsigned int rpc_read(0, pdptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemHostGetFlags(unsigned int* pFlags, void* p) { + maybe_copy_unified_arg(0, (void*)pFlags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemHostGetFlags) < 0 || rpc_write(0, pFlags, sizeof(unsigned int)) < 0 || @@ -4585,11 +6627,16 @@ CUresult cuMemHostGetFlags(unsigned int* pFlags, void* p) rpc_read(0, pFlags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pFlags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocManaged(CUdeviceptr* dptr, size_t bytesize, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocManaged) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -4599,11 +6646,16 @@ CUresult cuMemAllocManaged(CUdeviceptr* dptr, size_t bytesize, unsigned int flag rpc_read(0, dptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetByPCIBusId(CUdevice* dev, const char* pciBusId) { + maybe_copy_unified_arg(0, (void*)dev, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); CUresult return_value; std::size_t pciBusId_len = std::strlen(pciBusId) + 1; if (rpc_start_request(0, RPC_cuDeviceGetByPCIBusId) < 0 || @@ -4614,11 +6666,18 @@ CUresult cuDeviceGetByPCIBusId(CUdevice* dev, const char* pciBusId) rpc_read(0, dev, sizeof(CUdevice)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetPCIBusId(char* pciBusId, int len, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(len); i++) + 
maybe_copy_unified_arg(0, (void*)&pciBusId[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetPCIBusId) < 0 || rpc_write(0, &len, sizeof(int)) < 0 || @@ -4627,11 +6686,18 @@ CUresult cuDeviceGetPCIBusId(char* pciBusId, int len, CUdevice dev) rpc_read(0, pciBusId, len * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(len); i++) + maybe_copy_unified_arg(0, (void*)&pciBusId[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcGetEventHandle(CUipcEventHandle* pHandle, CUevent event) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcGetEventHandle) < 0 || rpc_write(0, pHandle, sizeof(CUipcEventHandle)) < 0 || @@ -4640,11 +6706,15 @@ CUresult cuIpcGetEventHandle(CUipcEventHandle* pHandle, CUevent event) rpc_read(0, pHandle, sizeof(CUipcEventHandle)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcOpenEventHandle(CUevent* phEvent, CUipcEventHandle handle) { + maybe_copy_unified_arg(0, (void*)phEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcOpenEventHandle) < 0 || rpc_write(0, phEvent, sizeof(CUevent)) < 0 || @@ -4653,11 +6723,15 @@ CUresult cuIpcOpenEventHandle(CUevent* phEvent, CUipcEventHandle handle) rpc_read(0, phEvent, sizeof(CUevent)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcGetMemHandle) < 0 || rpc_write(0, pHandle, sizeof(CUipcMemHandle)) < 0 || @@ -4666,11 +6740,16 @@ CUresult cuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) rpc_read(0, pHandle, sizeof(CUipcMemHandle)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcOpenMemHandle_v2(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcOpenMemHandle_v2) < 0 || rpc_write(0, pdptr, sizeof(CUdeviceptr)) < 0 || @@ -4680,22 +6759,30 @@ CUresult cuIpcOpenMemHandle_v2(CUdeviceptr* pdptr, CUipcMemHandle handle, unsign rpc_read(0, pdptr,
sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcCloseMemHandle(CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcCloseMemHandle) < 0 || rpc_write(0, &dptr, sizeof(CUdeviceptr)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpy) < 0 || rpc_write(0, &dst, sizeof(CUdeviceptr)) < 0 || @@ -4704,11 +6791,19 @@ CUresult cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyPeer) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4719,11 +6814,19 @@ CUresult cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr s rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)srcHost, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyHtoD_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4732,11 +6835,17 @@ CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t Byte rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + 
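// Minimal usage sketch for these hooks (illustrative only; the assumption that a
// cudaMallocManaged allocation ends up tracked as a unified pointer on the client is
// mine and is not shown in this hunk):
//
//   CUdeviceptr dst;                                        // assume a valid device allocation
//   float* buf = nullptr;
//   cudaMallocManaged((void**)&buf, 1024 * sizeof(float));  // assumed to register buf
//   buf[0] = 42.0f;                                         // plain host-side write
//   cuMemcpyHtoD_v2(dst, buf, 1024 * sizeof(float));
//   // the stub passes buf through maybe_copy_unified_arg with cudaMemcpyHostToDevice
//   // before rpc_start_request/rpc_write, and again with cudaMemcpyDeviceToHost after
//   // rpc_end_response, bracketing the remote copy.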
maybe_copy_unified_arg(0, (void*)srcHost, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyDtoD_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4745,11 +6854,18 @@ CUresult cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t By rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyDtoA_v2) < 0 || rpc_write(0, &dstArray, sizeof(CUarray)) < 0 || @@ -4759,11 +6875,19 @@ CUresult cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevi rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyAtoD_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4773,11 +6897,19 @@ CUresult cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffs rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyAtoH_v2(void* dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)dstHost, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if 
(rpc_start_request(0, RPC_cuMemcpyAtoH_v2) < 0 || rpc_write(0, &dstHost, sizeof(void*)) < 0 || @@ -4787,11 +6919,20 @@ CUresult cuMemcpyAtoH_v2(void* dstHost, CUarray srcArray, size_t srcOffset, size rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dstHost, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyAtoA_v2) < 0 || rpc_write(0, &dstArray, sizeof(CUarray)) < 0 || @@ -4802,11 +6943,20 @@ CUresult cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, s rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyAsync) < 0 || rpc_write(0, &dst, sizeof(CUdeviceptr)) < 0 || @@ -4816,11 +6966,21 @@ CUresult cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyPeerAsync) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4832,11 +6992,21 
@@ CUresult cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdevice rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)srcHost, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyHtoDAsync_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4846,11 +7016,19 @@ CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void* srcHost, size_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)srcHost, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyDtoDAsync_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4860,11 +7038,18 @@ CUresult cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD8_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4873,11 +7058,17 @@ CUresult cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD16_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4886,11 +7077,17 @@ CUresult cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD32_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4899,11 +7096,19 @@ CUresult cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D8_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4914,11 +7119,21 @@ CUresult cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char u rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D16_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4929,11 +7144,21 @@ CUresult cuMemsetD2D16_v2(CUdeviceptr 
dstDevice, size_t dstPitch, unsigned short rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D32_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4944,11 +7169,20 @@ CUresult cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int u rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD8Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4958,11 +7192,19 @@ CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUst rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD16Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4972,11 +7214,19 @@ CUresult cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CU rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD32Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4986,11 +7236,21 @@ CUresult cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUst rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D8Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -5002,11 +7262,23 @@ CUresult cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D16Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -5018,11 +7290,23 @@ CUresult cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned sho rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + 
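// Scalar parameters (dstPitch, us, Width, Height, hStream, and so on) are handed to
// maybe_copy_unified_arg via the address of the local by-value copy; unless that stack
// address happens to fall inside a tracked unified allocation, which it normally will
// not, these calls are expected to find nothing to copy.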
maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D32Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -5034,11 +7318,19 @@ CUresult cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayCreate_v2(CUarray* pHandle, const CUDA_ARRAY_DESCRIPTOR* pAllocateArray) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pAllocateArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayCreate_v2) < 0 || rpc_write(0, pHandle, sizeof(CUarray)) < 0 || @@ -5047,11 +7339,15 @@ CUresult cuArrayCreate_v2(CUarray* pHandle, const CUDA_ARRAY_DESCRIPTOR* pAlloca rpc_read(0, pHandle, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pAllocateArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayGetDescriptor_v2(CUDA_ARRAY_DESCRIPTOR* pArrayDescriptor, CUarray hArray) { + maybe_copy_unified_arg(0, (void*)pArrayDescriptor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayGetDescriptor_v2) < 0 || rpc_write(0, pArrayDescriptor, sizeof(CUDA_ARRAY_DESCRIPTOR)) < 0 || @@ -5060,11 +7356,15 @@ CUresult cuArrayGetDescriptor_v2(CUDA_ARRAY_DESCRIPTOR* pArrayDescriptor, CUarra rpc_read(0, pArrayDescriptor, sizeof(CUDA_ARRAY_DESCRIPTOR)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pArrayDescriptor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES* sparseProperties, CUarray array) { + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayGetSparseProperties) < 0 || rpc_write(0, 
sparseProperties, sizeof(CUDA_ARRAY_SPARSE_PROPERTIES)) < 0 || @@ -5073,11 +7373,15 @@ CUresult cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES* sparseProperti rpc_read(0, sparseProperties, sizeof(CUDA_ARRAY_SPARSE_PROPERTIES)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES* sparseProperties, CUmipmappedArray mipmap) { + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayGetSparseProperties) < 0 || rpc_write(0, sparseProperties, sizeof(CUDA_ARRAY_SPARSE_PROPERTIES)) < 0 || @@ -5086,11 +7390,16 @@ CUresult cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES* spars rpc_read(0, sparseProperties, sizeof(CUDA_ARRAY_SPARSE_PROPERTIES)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS* memoryRequirements, CUarray array, CUdevice device) { + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayGetMemoryRequirements) < 0 || rpc_write(0, memoryRequirements, sizeof(CUDA_ARRAY_MEMORY_REQUIREMENTS)) < 0 || @@ -5100,11 +7409,17 @@ CUresult cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS* memoryRequ rpc_read(0, memoryRequirements, sizeof(CUDA_ARRAY_MEMORY_REQUIREMENTS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS* memoryRequirements, CUmipmappedArray mipmap, CUdevice device) { + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayGetMemoryRequirements) < 0 || rpc_write(0, memoryRequirements, sizeof(CUDA_ARRAY_MEMORY_REQUIREMENTS)) < 0 || @@ -5114,11 +7429,17 @@ CUresult cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS* m rpc_read(0, memoryRequirements, sizeof(CUDA_ARRAY_MEMORY_REQUIREMENTS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayGetPlane(CUarray* pPlaneArray, CUarray hArray, unsigned int planeIdx) { + maybe_copy_unified_arg(0, (void*)pPlaneArray, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&planeIdx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayGetPlane) < 0 || rpc_write(0, pPlaneArray, sizeof(CUarray)) < 0 || @@ -5128,22 +7449,29 @@ CUresult cuArrayGetPlane(CUarray* pPlaneArray, CUarray hArray, unsigned int plan rpc_read(0, pPlaneArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pPlaneArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&planeIdx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayDestroy(CUarray hArray) { + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayDestroy) < 0 || rpc_write(0, &hArray, sizeof(CUarray)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArray3DCreate_v2(CUarray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pAllocateArray) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pAllocateArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArray3DCreate_v2) < 0 || rpc_write(0, pHandle, sizeof(CUarray)) < 0 || @@ -5152,11 +7480,15 @@ CUresult cuArray3DCreate_v2(CUarray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pAl rpc_read(0, pHandle, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pAllocateArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArray3DGetDescriptor_v2(CUDA_ARRAY3D_DESCRIPTOR* pArrayDescriptor, CUarray hArray) { + maybe_copy_unified_arg(0, (void*)pArrayDescriptor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArray3DGetDescriptor_v2) < 0 || rpc_write(0, pArrayDescriptor, sizeof(CUDA_ARRAY3D_DESCRIPTOR)) < 0 || @@ -5165,11 +7497,16 @@ CUresult cuArray3DGetDescriptor_v2(CUDA_ARRAY3D_DESCRIPTOR* pArrayDescriptor, CU rpc_read(0, pArrayDescriptor, sizeof(CUDA_ARRAY3D_DESCRIPTOR)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pArrayDescriptor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayCreate(CUmipmappedArray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc, unsigned int numMipmapLevels) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pMipmappedArrayDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numMipmapLevels, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayCreate) < 0 || rpc_write(0, pHandle, sizeof(CUmipmappedArray)) < 0 || @@ -5179,11 +7516,17 @@ CUresult cuMipmappedArrayCreate(CUmipmappedArray* pHandle, const CUDA_ARRAY3D_DE rpc_read(0, pHandle, sizeof(CUmipmappedArray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, 
(void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pMipmappedArrayDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numMipmapLevels, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayGetLevel(CUarray* pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level) { + maybe_copy_unified_arg(0, (void*)pLevelArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayGetLevel) < 0 || rpc_write(0, pLevelArray, sizeof(CUarray)) < 0 || @@ -5193,22 +7536,32 @@ CUresult cuMipmappedArrayGetLevel(CUarray* pLevelArray, CUmipmappedArray hMipmap rpc_read(0, pLevelArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pLevelArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) { + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayDestroy) < 0 || rpc_write(0, &hMipmappedArray, sizeof(CUmipmappedArray)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&alignment, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAddressReserve) < 0 || rpc_write(0, ptr, sizeof(CUdeviceptr)) < 0 || @@ -5220,11 +7573,18 @@ CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment, CU rpc_read(0, ptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&alignment, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) { + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAddressFree) < 0 || rpc_write(0, &ptr, sizeof(CUdeviceptr)) < 0 || @@ -5232,11 +7592,17 @@ CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); return return_value; } CUresult 
cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size, const CUmemAllocationProp* prop, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemCreate) < 0 || rpc_write(0, handle, sizeof(CUmemGenericAllocationHandle)) < 0 || @@ -5247,22 +7613,33 @@ CUresult cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size, const CU rpc_read(0, handle, sizeof(CUmemGenericAllocationHandle)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemRelease(CUmemGenericAllocationHandle handle) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemRelease) < 0 || rpc_write(0, &handle, sizeof(CUmemGenericAllocationHandle)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemMap) < 0 || rpc_write(0, &ptr, sizeof(CUdeviceptr)) < 0 || @@ -5273,11 +7650,19 @@ CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAlloc rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemMapArrayAsync(CUarrayMapInfo* mapInfoList, unsigned int count, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)mapInfoList, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemMapArrayAsync) < 0 || rpc_write(0, mapInfoList, sizeof(CUarrayMapInfo)) < 0 || @@ -5287,11 +7672,16 @@ CUresult cuMemMapArrayAsync(CUarrayMapInfo* mapInfoList, unsigned int count, CUs rpc_read(0, mapInfoList, sizeof(CUarrayMapInfo)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)mapInfoList, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) { + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemUnmap) < 0 || rpc_write(0, &ptr, sizeof(CUdeviceptr)) < 0 || @@ -5299,11 +7689,17 @@ CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc* desc, size_t count) { + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemSetAccess) < 0 || rpc_write(0, &ptr, sizeof(CUdeviceptr)) < 0 || @@ -5313,11 +7709,18 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc* des rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetAccess(unsigned long long* flags, const CUmemLocation* location, CUdeviceptr ptr) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetAccess) < 0 || rpc_write(0, flags, sizeof(unsigned long long)) < 0 || @@ -5327,11 +7730,17 @@ CUresult cuMemGetAccess(unsigned long long* flags, const CUmemLocation* location rpc_read(0, flags, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetAllocationGranularity(size_t* granularity, const CUmemAllocationProp* prop, CUmemAllocationGranularity_flags option) { + maybe_copy_unified_arg(0, (void*)granularity, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&option, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetAllocationGranularity) < 0 || rpc_write(0, granularity, sizeof(size_t)) < 0 || @@ -5341,11 +7750,16 @@ CUresult cuMemGetAllocationGranularity(size_t* granularity, const CUmemAllocatio rpc_read(0, granularity, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)granularity, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&option, cudaMemcpyDeviceToHost); return return_value; } CUresult 
cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp* prop, CUmemGenericAllocationHandle handle) { + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetAllocationPropertiesFromHandle) < 0 || rpc_write(0, prop, sizeof(CUmemAllocationProp)) < 0 || @@ -5354,11 +7768,15 @@ CUresult cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp* prop, CUmem rpc_read(0, prop, sizeof(CUmemAllocationProp)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemFreeAsync) < 0 || rpc_write(0, &dptr, sizeof(CUdeviceptr)) < 0 || @@ -5366,11 +7784,16 @@ CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocAsync(CUdeviceptr* dptr, size_t bytesize, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocAsync) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -5380,11 +7803,16 @@ CUresult cuMemAllocAsync(CUdeviceptr* dptr, size_t bytesize, CUstream hStream) rpc_read(0, dptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep) { + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minBytesToKeep, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolTrimTo) < 0 || rpc_write(0, &pool, sizeof(CUmemoryPool)) < 0 || @@ -5392,11 +7820,16 @@ CUresult cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minBytesToKeep, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc* map, size_t count) { + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)map, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolSetAccess) < 0 || rpc_write(0, &pool, sizeof(CUmemoryPool)) < 0 || @@ -5405,11 +7838,17 @@ CUresult cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc* 
map, size_ rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)map, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolGetAccess(CUmemAccess_flags* flags, CUmemoryPool memPool, CUmemLocation* location) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolGetAccess) < 0 || rpc_write(0, flags, sizeof(CUmemAccess_flags)) < 0 || @@ -5420,11 +7859,16 @@ CUresult cuMemPoolGetAccess(CUmemAccess_flags* flags, CUmemoryPool memPool, CUme rpc_read(0, location, sizeof(CUmemLocation)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolCreate(CUmemoryPool* pool, const CUmemPoolProps* poolProps) { + maybe_copy_unified_arg(0, (void*)pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)poolProps, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolCreate) < 0 || rpc_write(0, pool, sizeof(CUmemoryPool)) < 0 || @@ -5433,22 +7877,30 @@ CUresult cuMemPoolCreate(CUmemoryPool* pool, const CUmemPoolProps* poolProps) rpc_read(0, pool, sizeof(CUmemoryPool)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)poolProps, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolDestroy(CUmemoryPool pool) { + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolDestroy) < 0 || rpc_write(0, &pool, sizeof(CUmemoryPool)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocFromPoolAsync(CUdeviceptr* dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocFromPoolAsync) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -5459,11 +7911,17 @@ CUresult cuMemAllocFromPoolAsync(CUdeviceptr* dptr, size_t bytesize, CUmemoryPoo rpc_read(0, dptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolExportPointer(CUmemPoolPtrExportData* shareData_out, CUdeviceptr ptr) { + 
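/* --- Editor's illustrative sketch; not part of the generated diff. ---
   A minimal caller-side view of these wrappers when an argument lives in
   unified (managed) memory. It assumes client.cpp registers cudaMallocManaged
   allocations in conns[index].unified_devices; that registration is outside
   this hunk, so the helper below is hypothetical test code, not generated output. */
#include <cuda.h>
#include <cuda_runtime.h>

static CUresult export_pointer_example(CUdeviceptr poolAlloc)
{
    CUmemPoolPtrExportData* shareData = nullptr;
    // One pointer valid on both host and device.
    if (cudaMallocManaged(&shareData, sizeof(*shareData)) != cudaSuccess)
        return CUDA_ERROR_OUT_OF_MEMORY;
    // The wrapper syncs registered unified arguments to the remote device
    // before the RPC and back to the host afterwards, so shareData can be
    // read here as if the driver call had run locally.
    CUresult res = cuMemPoolExportPointer(shareData, poolAlloc);
    cudaFree(shareData);
    return res;
}
/* --- end sketch --- */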
maybe_copy_unified_arg(0, (void*)shareData_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolExportPointer) < 0 || rpc_write(0, shareData_out, sizeof(CUmemPoolPtrExportData)) < 0 || @@ -5472,11 +7930,16 @@ CUresult cuMemPoolExportPointer(CUmemPoolPtrExportData* shareData_out, CUdevicep rpc_read(0, shareData_out, sizeof(CUmemPoolPtrExportData)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)shareData_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolImportPointer(CUdeviceptr* ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData* shareData) { + maybe_copy_unified_arg(0, (void*)ptr_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)shareData, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolImportPointer) < 0 || rpc_write(0, ptr_out, sizeof(CUdeviceptr)) < 0 || @@ -5487,11 +7950,18 @@ CUresult cuMemPoolImportPointer(CUdeviceptr* ptr_out, CUmemoryPool pool, CUmemPo rpc_read(0, shareData, sizeof(CUmemPoolPtrExportData)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)ptr_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)shareData, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPrefetchAsync) < 0 || rpc_write(0, &devPtr, sizeof(CUdeviceptr)) < 0 || @@ -5501,11 +7971,19 @@ CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) { + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&advice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAdvise) < 0 || rpc_write(0, &devPtr, sizeof(CUdeviceptr)) < 0 || @@ -5515,11 +7993,21 @@ CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUde rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&advice, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemRangeGetAttributes(void** data, size_t* dataSizes, CUmem_range_attribute* attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count) { + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dataSizes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemRangeGetAttributes) < 0 || rpc_write(0, data, sizeof(void*)) < 0 || @@ -5534,11 +8022,20 @@ CUresult cuMemRangeGetAttributes(void** data, size_t* dataSizes, CUmem_range_att rpc_read(0, attributes, sizeof(CUmem_range_attribute)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dataSizes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuPointerSetAttribute(const void* value, CUpointer_attribute attribute, CUdeviceptr ptr) { + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attribute, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuPointerSetAttribute) < 0 || rpc_write(0, &value, sizeof(const void*)) < 0 || @@ -5547,11 +8044,18 @@ CUresult cuPointerSetAttribute(const void* value, CUpointer_attribute attribute, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attribute, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute* attributes, void** data, CUdeviceptr ptr) { + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuPointerGetAttributes) < 0 || rpc_write(0, &numAttributes, sizeof(unsigned int)) < 0 || @@ -5563,11 +8067,17 @@ CUresult cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute* rpc_read(0, data, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamCreate(CUstream* phStream, unsigned int Flags) { + 
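/* --- Editor's illustrative sketch; not part of the generated diff. ---
   From the application's side the RPC-backed client keeps the usual driver-API
   call shape; only the transport changes. A hypothetical smoke test for the
   stream entry points wrapped below (names are the editor's, not generated): */
#include <cuda.h>

static bool stream_roundtrip_example()
{
    CUstream s = nullptr;
    if (cuStreamCreate(&s, CU_STREAM_NON_BLOCKING) != CUDA_SUCCESS)
        return false;
    unsigned long long id = 0;
    // Each call is a single request/response pair on connection 0.
    bool ok = cuStreamGetId(s, &id) == CUDA_SUCCESS &&
              cuStreamSynchronize(s) == CUDA_SUCCESS;
    cuStreamDestroy_v2(s);
    return ok;
}
/* --- end sketch --- */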
maybe_copy_unified_arg(0, (void*)phStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamCreate) < 0 || rpc_write(0, phStream, sizeof(CUstream)) < 0 || @@ -5576,11 +8086,16 @@ CUresult cuStreamCreate(CUstream* phStream, unsigned int Flags) rpc_read(0, phStream, sizeof(CUstream)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamCreateWithPriority(CUstream* phStream, unsigned int flags, int priority) { + maybe_copy_unified_arg(0, (void*)phStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&priority, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamCreateWithPriority) < 0 || rpc_write(0, phStream, sizeof(CUstream)) < 0 || @@ -5590,11 +8105,16 @@ CUresult cuStreamCreateWithPriority(CUstream* phStream, unsigned int flags, int rpc_read(0, phStream, sizeof(CUstream)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&priority, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetPriority(CUstream hStream, int* priority) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)priority, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetPriority) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5603,11 +8123,15 @@ CUresult cuStreamGetPriority(CUstream hStream, int* priority) rpc_read(0, priority, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)priority, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetFlags(CUstream hStream, unsigned int* flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetFlags) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5616,11 +8140,15 @@ CUresult cuStreamGetFlags(CUstream hStream, unsigned int* flags) rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetId(CUstream hStream, unsigned long long* streamId) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetId) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5629,11 +8157,15 @@ CUresult cuStreamGetId(CUstream hStream, unsigned long long* streamId) rpc_read(0, streamId, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetCtx) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5642,11 +8174,16 @@ CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx) rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWaitEvent) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5655,11 +8192,16 @@ CUresult cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamBeginCapture_v2) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5667,11 +8209,14 @@ CUresult cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } CUresult cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode* mode) { + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuThreadExchangeStreamCaptureMode) < 0 || rpc_write(0, mode, sizeof(CUstreamCaptureMode)) < 0 || @@ -5679,11 +8224,14 @@ CUresult cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode* mode) rpc_read(0, mode, sizeof(CUstreamCaptureMode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamEndCapture(CUstream hStream, CUgraph* phGraph) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamEndCapture) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5692,11 +8240,15 @@ CUresult cuStreamEndCapture(CUstream hStream, CUgraph* phGraph) rpc_read(0, phGraph, sizeof(CUgraph)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, 
(void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* captureStatus) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)captureStatus, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamIsCapturing) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5705,11 +8257,17 @@ CUresult cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* captureSta rpc_read(0, captureStatus, sizeof(CUstreamCaptureStatus)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)captureStatus, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* dependencies, size_t numDependencies, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamUpdateCaptureDependencies) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5720,11 +8278,19 @@ CUresult cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* depend rpc_read(0, dependencies, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamAttachMemAsync) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5734,44 +8300,56 @@ CUresult cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t lengt rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamQuery(CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamQuery) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamSynchronize(CUstream hStream) { + 
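/* --- Editor's illustrative sketch; not part of the generated diff. ---
   The capture wrappers above return their out-parameters (CUgraph,
   CUstreamCaptureStatus) via rpc_read, so the ordinary begin/query/end
   capture flow still works over the RPC transport. Hypothetical example: */
#include <cuda.h>

static CUresult capture_example(CUstream s, CUgraph* graphOut)
{
    CUresult rc = cuStreamBeginCapture_v2(s, CU_STREAM_CAPTURE_MODE_GLOBAL);
    if (rc != CUDA_SUCCESS)
        return rc;
    CUstreamCaptureStatus status = CU_STREAM_CAPTURE_STATUS_NONE;
    // The status value is filled in from the server's reply.
    rc = cuStreamIsCapturing(s, &status);
    if (rc != CUDA_SUCCESS)
        return rc;
    return cuStreamEndCapture(s, graphOut);
}
/* --- end sketch --- */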
maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamSynchronize) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamDestroy_v2(CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamDestroy_v2) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamCopyAttributes(CUstream dst, CUstream src) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamCopyAttributes) < 0 || rpc_write(0, &dst, sizeof(CUstream)) < 0 || @@ -5779,11 +8357,16 @@ CUresult cuStreamCopyAttributes(CUstream dst, CUstream src) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue* value_out) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetAttribute) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5793,11 +8376,17 @@ CUresult cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAtt rpc_read(0, value_out, sizeof(CUstreamAttrValue)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue* value) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamSetAttribute) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5806,11 +8395,16 @@ CUresult cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventCreate(CUevent* phEvent, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)phEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, 
RPC_cuEventCreate) < 0 || rpc_write(0, phEvent, sizeof(CUevent)) < 0 || @@ -5819,11 +8413,15 @@ CUresult cuEventCreate(CUevent* phEvent, unsigned int Flags) rpc_read(0, phEvent, sizeof(CUevent)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventRecord(CUevent hEvent, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventRecord) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || @@ -5831,11 +8429,16 @@ CUresult cuEventRecord(CUevent hEvent, CUstream hStream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventRecordWithFlags) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || @@ -5844,44 +8447,56 @@ CUresult cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int f rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventQuery(CUevent hEvent) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventQuery) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventSynchronize(CUevent hEvent) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventSynchronize) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventDestroy_v2(CUevent hEvent) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventDestroy_v2) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) { + maybe_copy_unified_arg(0, (void*)pMilliseconds, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStart, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&hEnd, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventElapsedTime) < 0 || rpc_write(0, pMilliseconds, sizeof(float)) < 0 || @@ -5891,11 +8506,16 @@ CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) rpc_read(0, pMilliseconds, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pMilliseconds, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStart, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hEnd, cudaMemcpyDeviceToHost); return return_value; } CUresult cuImportExternalMemory(CUexternalMemory* extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC* memHandleDesc) { + maybe_copy_unified_arg(0, (void*)extMem_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memHandleDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuImportExternalMemory) < 0 || rpc_write(0, extMem_out, sizeof(CUexternalMemory)) < 0 || @@ -5904,11 +8524,16 @@ CUresult cuImportExternalMemory(CUexternalMemory* extMem_out, const CUDA_EXTERNA rpc_read(0, extMem_out, sizeof(CUexternalMemory)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)extMem_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memHandleDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuExternalMemoryGetMappedBuffer(CUdeviceptr* devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC* bufferDesc) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuExternalMemoryGetMappedBuffer) < 0 || rpc_write(0, devPtr, sizeof(CUdeviceptr)) < 0 || @@ -5918,11 +8543,17 @@ CUresult cuExternalMemoryGetMappedBuffer(CUdeviceptr* devPtr, CUexternalMemory e rpc_read(0, devPtr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray* mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* mipmapDesc) { + maybe_copy_unified_arg(0, (void*)mipmap, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mipmapDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuExternalMemoryGetMappedMipmappedArray) < 0 || rpc_write(0, mipmap, sizeof(CUmipmappedArray)) < 0 || @@ -5932,22 +8563,29 @@ CUresult cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray* mipmap, CUext rpc_read(0, mipmap, sizeof(CUmipmappedArray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)mipmap, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mipmapDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDestroyExternalMemory(CUexternalMemory extMem) { + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); CUresult return_value; if 
(rpc_start_request(0, RPC_cuDestroyExternalMemory) < 0 || rpc_write(0, &extMem, sizeof(CUexternalMemory)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); return return_value; } CUresult cuImportExternalSemaphore(CUexternalSemaphore* extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* semHandleDesc) { + maybe_copy_unified_arg(0, (void*)extSem_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)semHandleDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuImportExternalSemaphore) < 0 || rpc_write(0, extSem_out, sizeof(CUexternalSemaphore)) < 0 || @@ -5956,11 +8594,17 @@ CUresult cuImportExternalSemaphore(CUexternalSemaphore* extSem_out, const CUDA_E rpc_read(0, extSem_out, sizeof(CUexternalSemaphore)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)extSem_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)semHandleDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSignalExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray, unsigned int numExtSems, CUstream stream) { + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSignalExternalSemaphoresAsync) < 0 || rpc_write(0, &extSemArray, sizeof(const CUexternalSemaphore*)) < 0 || @@ -5970,11 +8614,19 @@ CUresult cuSignalExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuWaitExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray, unsigned int numExtSems, CUstream stream) { + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuWaitExternalSemaphoresAsync) < 0 || rpc_write(0, &extSemArray, sizeof(const CUexternalSemaphore*)) < 0 || @@ -5984,22 +8636,32 @@ CUresult cuWaitExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, c rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDestroyExternalSemaphore(CUexternalSemaphore extSem) { + 
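/* --- Editor's illustrative sketch; not part of the generated diff. ---
   The stream memop wrappers below take their operands by value. A typical
   producer/consumer use, assuming flagPtr is a device address the caller
   already owns (function and parameter names here are hypothetical): */
#include <cuda.h>

static CUresult signal_then_wait_example(CUstream producer, CUstream consumer,
                                         CUdeviceptr flagPtr)
{
    // Consumer blocks until *flagPtr >= 1 ...
    CUresult rc = cuStreamWaitValue32_v2(consumer, flagPtr, 1,
                                         CU_STREAM_WAIT_VALUE_GEQ);
    if (rc != CUDA_SUCCESS)
        return rc;
    // ... and the producer releases it by writing 1.
    return cuStreamWriteValue32_v2(producer, flagPtr, 1,
                                   CU_STREAM_WRITE_VALUE_DEFAULT);
}
/* --- end sketch --- */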
maybe_copy_unified_arg(0, (void*)&extSem, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDestroyExternalSemaphore) < 0 || rpc_write(0, &extSem, sizeof(CUexternalSemaphore)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&extSem, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWaitValue32_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6009,11 +8671,19 @@ CUresult cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t va rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWaitValue64_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6023,11 +8693,19 @@ CUresult cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t va rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWriteValue32_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6037,11 +8715,19 @@ CUresult cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t v rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t 
value, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWriteValue64_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6051,11 +8737,19 @@ CUresult cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t v rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams* paramArray, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamBatchMemOp_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6066,11 +8760,18 @@ CUresult cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatc rpc_read(0, paramArray, sizeof(CUstreamBatchMemOpParams)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc) { + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncGetAttribute) < 0 || rpc_write(0, pi, sizeof(int)) < 0 || @@ -6080,11 +8781,17 @@ CUresult cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfu rpc_read(0, pi, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetAttribute) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6093,11 +8800,16 @@ CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int v rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetCacheConfig) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6105,11 +8817,15 @@ CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetSharedMemConfig) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6117,11 +8833,15 @@ CUresult cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncGetModule(CUmodule* hmod, CUfunction hfunc) { + maybe_copy_unified_arg(0, (void*)hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncGetModule) < 0 || rpc_write(0, hmod, sizeof(CUmodule)) < 0 || @@ -6130,11 +8850,24 @@ CUresult cuFuncGetModule(CUmodule* hmod, CUfunction hfunc) rpc_read(0, hmod, sizeof(CUmodule)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimX, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimY, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimZ, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimX, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimY, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimZ, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sharedMemBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)kernelParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)extra, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchKernel) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || @@ -6151,11 +8884,32 @@ CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDi 
rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimX, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimY, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimZ, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimX, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimY, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimZ, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sharedMemBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)kernelParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)extra, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimX, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimY, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimZ, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimX, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimY, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimZ, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sharedMemBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)kernelParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchCooperativeKernel) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || @@ -6172,11 +8926,24 @@ CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned rpc_read(0, kernelParams, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimX, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimY, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimZ, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimX, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimY, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimZ, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sharedMemBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)kernelParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)launchParamsList, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDevices, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchCooperativeKernelMultiDevice) < 0 || rpc_write(0, launchParamsList, sizeof(CUDA_LAUNCH_PARAMS)) < 0 || @@ -6186,11 +8953,18 @@ CUresult 
cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsLi rpc_read(0, launchParamsList, sizeof(CUDA_LAUNCH_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)launchParamsList, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDevices, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&z, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetBlockShape) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6200,11 +8974,17 @@ CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&z, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetSharedSize) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6212,11 +8992,15 @@ CUresult cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuParamSetSize(CUfunction hfunc, unsigned int numbytes) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numbytes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuParamSetSize) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6224,11 +9008,16 @@ CUresult cuParamSetSize(CUfunction hfunc, unsigned int numbytes) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numbytes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuParamSeti(CUfunction hfunc, int offset, unsigned int value) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuParamSeti) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6237,11 +9026,17 @@ CUresult cuParamSeti(CUfunction hfunc, int offset, unsigned int value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuParamSetf(CUfunction hfunc, int offset, float value) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuParamSetf) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6250,22 +9045,30 @@ CUresult cuParamSetf(CUfunction hfunc, int offset, float value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunch(CUfunction f) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunch) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&grid_width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&grid_height, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchGrid) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || @@ -6274,11 +9077,18 @@ CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&grid_width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&grid_height, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&grid_width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&grid_height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchGridAsync) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || @@ -6288,11 +9098,18 @@ CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstre rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&grid_width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&grid_height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texunit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuParamSetTexRef) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ 
-6301,11 +9118,16 @@ CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texunit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphCreate(CUgraph* phGraph, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphCreate) < 0 || rpc_write(0, phGraph, sizeof(CUgraph)) < 0 || @@ -6314,11 +9136,18 @@ CUresult cuGraphCreate(CUgraph* phGraph, unsigned int flags) rpc_read(0, phGraph, sizeof(CUgraph)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddKernelNode_v2(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddKernelNode_v2) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6330,11 +9159,18 @@ CUresult cuGraphAddKernelNode_v2(CUgraphNode* phGraphNode, CUgraph hGraph, const rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeGetParams_v2(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeGetParams_v2) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6343,11 +9179,15 @@ CUresult cuGraphKernelNodeGetParams_v2(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAM rpc_read(0, nodeParams, sizeof(CUDA_KERNEL_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeSetParams_v2(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeSetParams_v2) < 0 || 
rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6355,11 +9195,19 @@ CUresult cuGraphKernelNodeSetParams_v2(CUgraphNode hNode, const CUDA_KERNEL_NODE rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddMemcpyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_MEMCPY3D* copyParams, CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)copyParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddMemcpyNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6372,11 +9220,19 @@ CUresult cuGraphAddMemcpyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CU rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)copyParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemcpyNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6385,11 +9241,15 @@ CUresult cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D* nodeParams rpc_read(0, nodeParams, sizeof(CUDA_MEMCPY3D)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemcpyNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6397,11 +9257,19 @@ CUresult cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D* node rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddMemsetNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS* memsetParams, CUcontext 
ctx) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memsetParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddMemsetNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6414,11 +9282,19 @@ CUresult cuGraphAddMemsetNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CU rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memsetParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemsetNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6427,11 +9303,15 @@ CUresult cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS* rpc_read(0, nodeParams, sizeof(CUDA_MEMSET_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemsetNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6439,11 +9319,18 @@ CUresult cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PA rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddHostNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddHostNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6455,11 +9342,18 @@ CUresult cuGraphAddHostNode(CUgraphNode* phGraphNode, 
CUgraph hGraph, const CUgr rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphHostNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6468,11 +9362,15 @@ CUresult cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS* node rpc_read(0, nodeParams, sizeof(CUDA_HOST_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphHostNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6480,11 +9378,18 @@ CUresult cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddChildGraphNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUgraph childGraph) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddChildGraphNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6496,11 +9401,18 @@ CUresult cuGraphAddChildGraphNode(CUgraphNode* phGraphNode, CUgraph hGraph, cons rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph* phGraph) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)phGraph, 
cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphChildGraphNodeGetGraph) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6509,11 +9421,17 @@ CUresult cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph* phGraph) rpc_read(0, phGraph, sizeof(CUgraph)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddEmptyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddEmptyNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6524,11 +9442,20 @@ CUresult cuGraphAddEmptyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUg rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddEventRecordNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUevent event) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddEventRecordNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6540,11 +9467,18 @@ CUresult cuGraphAddEventRecordNode(CUgraphNode* phGraphNode, CUgraph hGraph, con rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent* event_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphEventRecordNodeGetEvent) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6553,11 +9487,15 @@ CUresult cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent* event_out) rpc_read(0, event_out, sizeof(CUevent)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, 
(void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphEventRecordNodeSetEvent) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6565,11 +9503,18 @@ CUresult cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddEventWaitNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUevent event) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddEventWaitNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6581,11 +9526,18 @@ CUresult cuGraphAddEventWaitNode(CUgraphNode* phGraphNode, CUgraph hGraph, const rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent* event_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphEventWaitNodeGetEvent) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6594,11 +9546,15 @@ CUresult cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent* event_out) rpc_read(0, event_out, sizeof(CUevent)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphEventWaitNodeSetEvent) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6606,11 +9562,18 @@ CUresult cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddExternalSemaphoresSignalNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddExternalSemaphoresSignalNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6622,11 +9585,18 @@ CUresult cuGraphAddExternalSemaphoresSignalNode(CUgraphNode* phGraphNode, CUgrap rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExternalSemaphoresSignalNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6635,11 +9605,15 @@ CUresult cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EX rpc_read(0, params_out, sizeof(CUDA_EXT_SEM_SIGNAL_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExternalSemaphoresSignalNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6647,11 +9621,18 @@ CUresult cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const C rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddExternalSemaphoresWaitNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddExternalSemaphoresWaitNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6663,11 +9644,18 @@ CUresult cuGraphAddExternalSemaphoresWaitNode(CUgraphNode* phGraphNode, CUgraph rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExternalSemaphoresWaitNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6676,11 +9664,15 @@ CUresult cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_ rpc_read(0, params_out, sizeof(CUDA_EXT_SEM_WAIT_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExternalSemaphoresWaitNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6688,11 +9680,18 @@ CUresult cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUD rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddBatchMemOpNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddBatchMemOpNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6704,11 +9703,18 @@ CUresult cuGraphAddBatchMemOpNode(CUgraphNode* phGraphNode, CUgraph hGraph, cons rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphBatchMemOpNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6717,11 +9723,15 @@ CUresult cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NOD rpc_read(0, nodeParams_out, sizeof(CUDA_BATCH_MEM_OP_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphBatchMemOpNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6729,11 +9739,16 @@ CUresult cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_ rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecBatchMemOpNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -6742,11 +9757,19 @@ CUresult cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddMemAllocNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddMemAllocNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6759,11 +9782,18 @@ CUresult cuGraphAddMemAllocNode(CUgraphNode* phGraphNode, CUgraph hGraph, const rpc_read(0, nodeParams, 
sizeof(CUDA_MEM_ALLOC_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemAllocNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6772,11 +9802,20 @@ CUresult cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PAR rpc_read(0, params_out, sizeof(CUDA_MEM_ALLOC_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddMemFreeNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numDependencies); i++) + maybe_copy_unified_arg(0, (void*)&dependencies[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddMemFreeNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6788,11 +9827,20 @@ CUresult cuGraphAddMemFreeNode(CUgraphNode* phGraphNode, CUgraph hGraph, const C rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numDependencies); i++) + maybe_copy_unified_arg(0, (void*)&dependencies[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr* dptr_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dptr_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemFreeNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6801,22 +9849,28 @@ CUresult cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr* dptr_out) rpc_read(0, dptr_out, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dptr_out, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGraphMemTrim(CUdevice device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGraphMemTrim) < 0 || rpc_write(0, &device, sizeof(CUdevice)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphClone(CUgraph* phGraphClone, CUgraph originalGraph) { + maybe_copy_unified_arg(0, (void*)phGraphClone, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&originalGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphClone) < 0 || rpc_write(0, phGraphClone, sizeof(CUgraph)) < 0 || @@ -6825,11 +9879,16 @@ CUresult cuGraphClone(CUgraph* phGraphClone, CUgraph originalGraph) rpc_read(0, phGraphClone, sizeof(CUgraph)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphClone, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&originalGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeFindInClone(CUgraphNode* phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph) { + maybe_copy_unified_arg(0, (void*)phNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOriginalNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hClonedGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeFindInClone) < 0 || rpc_write(0, phNode, sizeof(CUgraphNode)) < 0 || @@ -6839,11 +9898,16 @@ CUresult cuGraphNodeFindInClone(CUgraphNode* phNode, CUgraphNode hOriginalNode, rpc_read(0, phNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOriginalNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hClonedGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType* type) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeGetType) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6852,11 +9916,16 @@ CUresult cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType* type) rpc_read(0, type, sizeof(CUgraphNodeType)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphGetNodes(CUgraph hGraph, CUgraphNode* nodes, size_t* numNodes) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numNodes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphGetNodes) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6867,11 +9936,17 @@ CUresult cuGraphGetNodes(CUgraph hGraph, CUgraphNode* nodes, size_t* numNodes) rpc_read(0, numNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numNodes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_t* numRootNodes) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)rootNodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numRootNodes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphGetRootNodes) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6882,11 +9957,18 @@ CUresult cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_t* num rpc_read(0, numRootNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)rootNodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numRootNodes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from, CUgraphNode* to, size_t* numEdges) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numEdges, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphGetEdges) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6899,11 +9981,18 @@ CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from, CUgraphNode* to, siz rpc_read(0, numEdges, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numEdges, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, size_t* numDependencies) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numDependencies, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeGetDependencies) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6914,11 +10003,17 @@ CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies rpc_read(0, numDependencies, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numDependencies, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, size_t* numDependentNodes) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependentNodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numDependentNodes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeGetDependentNodes) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6929,11 +10024,18 @@ CUresult 
cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentN rpc_read(0, numDependentNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependentNodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numDependentNodes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from, const CUgraphNode* to, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddDependencies) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6943,11 +10045,19 @@ CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from, const C rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from, const CUgraphNode* to, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphRemoveDependencies) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6957,22 +10067,31 @@ CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from, cons rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphDestroyNode(CUgraphNode hNode) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphDestroyNode) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphInstantiateWithFlags(CUgraphExec* phGraphExec, CUgraph hGraph, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)phGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphInstantiateWithFlags) < 0 || rpc_write(0, phGraphExec, sizeof(CUgraphExec)) < 0 || @@ -6982,11 +10101,17 @@ CUresult cuGraphInstantiateWithFlags(CUgraphExec* phGraphExec, CUgraph hGraph, u 
rpc_read(0, phGraphExec, sizeof(CUgraphExec)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphInstantiateWithParams(CUgraphExec* phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS* instantiateParams) { + maybe_copy_unified_arg(0, (void*)phGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)instantiateParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphInstantiateWithParams) < 0 || rpc_write(0, phGraphExec, sizeof(CUgraphExec)) < 0 || @@ -6997,11 +10122,16 @@ CUresult cuGraphInstantiateWithParams(CUgraphExec* phGraphExec, CUgraph hGraph, rpc_read(0, instantiateParams, sizeof(CUDA_GRAPH_INSTANTIATE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)instantiateParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecGetFlags(CUgraphExec hGraphExec, cuuint64_t* flags) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecGetFlags) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7010,11 +10140,16 @@ CUresult cuGraphExecGetFlags(CUgraphExec hGraphExec, cuuint64_t* flags) rpc_read(0, flags, sizeof(cuuint64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecKernelNodeSetParams_v2(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecKernelNodeSetParams_v2) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7023,11 +10158,18 @@ CUresult cuGraphExecKernelNodeSetParams_v2(CUgraphExec hGraphExec, CUgraphNode h rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D* copyParams, CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)copyParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecMemcpyNodeSetParams) < 0 || 
rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7037,11 +10179,19 @@ CUresult cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNod rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)copyParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS* memsetParams, CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memsetParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecMemsetNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7051,11 +10201,18 @@ CUresult cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNod rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memsetParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecHostNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7064,11 +10221,17 @@ CUresult cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecChildGraphNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7077,11 +10240,17 @@ CUresult cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult 
cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecEventRecordNodeSetEvent) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7090,11 +10259,17 @@ CUresult cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecEventWaitNodeSetEvent) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7103,11 +10278,17 @@ CUresult cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hN rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecExternalSemaphoresSignalNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7116,11 +10297,17 @@ CUresult cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecExternalSemaphoresWaitNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7129,11 +10316,17 @@ CUresult cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, 
(void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&isEnabled, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeSetEnabled) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7142,11 +10335,17 @@ CUresult cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsign rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&isEnabled, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int* isEnabled) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeGetEnabled) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7156,11 +10355,16 @@ CUresult cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsign rpc_read(0, isEnabled, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphUpload) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7168,11 +10372,15 @@ CUresult cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphLaunch) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7180,33 +10388,42 @@ CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecDestroy(CUgraphExec hGraphExec) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, 
cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecDestroy) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphDestroy(CUgraph hGraph) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphDestroy) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecUpdate_v2(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExecUpdateResultInfo* resultInfo) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resultInfo, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecUpdate_v2) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7216,11 +10433,16 @@ CUresult cuGraphExecUpdate_v2(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExe rpc_read(0, resultInfo, sizeof(CUgraphExecUpdateResultInfo)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resultInfo, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeCopyAttributes) < 0 || rpc_write(0, &dst, sizeof(CUgraphNode)) < 0 || @@ -7228,11 +10450,16 @@ CUresult cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue* value_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeGetAttribute) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -7242,11 +10469,17 @@ CUresult cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID att rpc_read(0, value_out, sizeof(CUkernelNodeAttrValue)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue* value) { + maybe_copy_unified_arg(0, 
(void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeSetAttribute) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -7255,11 +10488,17 @@ CUresult cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID att rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphDebugDotPrint(CUgraph hGraph, const char* path, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphDebugDotPrint) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -7268,11 +10507,16 @@ CUresult cuGraphDebugDotPrint(CUgraph hGraph, const char* path, unsigned int fla rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuUserObjectRetain(CUuserObject object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuUserObjectRetain) < 0 || rpc_write(0, &object, sizeof(CUuserObject)) < 0 || @@ -7280,11 +10524,15 @@ CUresult cuUserObjectRetain(CUuserObject object, unsigned int count) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuUserObjectRelease(CUuserObject object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuUserObjectRelease) < 0 || rpc_write(0, &object, sizeof(CUuserObject)) < 0 || @@ -7292,11 +10540,17 @@ CUresult cuUserObjectRelease(CUuserObject object, unsigned int count) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphRetainUserObject) < 0 || rpc_write(0, &graph, sizeof(CUgraph)) < 0 || @@ -7306,11 +10560,18 
@@ CUresult cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned in rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphReleaseUserObject) < 0 || rpc_write(0, &graph, sizeof(CUgraph)) < 0 || @@ -7319,11 +10580,18 @@ CUresult cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned i rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) { + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyMaxActiveBlocksPerMultiprocessor) < 0 || rpc_write(0, numBlocks, sizeof(int)) < 0 || @@ -7334,11 +10602,20 @@ CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, CUfunction rpc_read(0, numBlocks, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) < 0 || rpc_write(0, numBlocks, sizeof(int)) < 0 || @@ -7350,11 +10627,20 @@ CUresult cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, CU rpc_read(0, numBlocks, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, CUfunction func, int numBlocks, int blockSize) { + maybe_copy_unified_arg(0, (void*)dynamicSmemSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyAvailableDynamicSMemPerBlock) < 0 || rpc_write(0, dynamicSmemSize, sizeof(size_t)) < 0 || @@ -7365,11 +10651,18 @@ CUresult cuOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, CUfunc rpc_read(0, dynamicSmemSize, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dynamicSmemSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyMaxPotentialClusterSize(int* clusterSize, CUfunction func, const CUlaunchConfig* config) { + maybe_copy_unified_arg(0, (void*)clusterSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyMaxPotentialClusterSize) < 0 || rpc_write(0, clusterSize, sizeof(int)) < 0 || @@ -7379,11 +10672,17 @@ CUresult cuOccupancyMaxPotentialClusterSize(int* clusterSize, CUfunction func, c rpc_read(0, clusterSize, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)clusterSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyMaxActiveClusters(int* numClusters, CUfunction func, const CUlaunchConfig* config) { + maybe_copy_unified_arg(0, (void*)numClusters, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyMaxActiveClusters) < 0 || rpc_write(0, numClusters, sizeof(int)) < 0 || @@ -7393,11 +10692,17 @@ CUresult cuOccupancyMaxActiveClusters(int* numClusters, CUfunction func, const C rpc_read(0, numClusters, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)numClusters, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetArray) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ 
-7406,11 +10711,17 @@ CUresult cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMipmappedArray) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7419,11 +10730,18 @@ CUresult cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmapped rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetAddress_v2(size_t* ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes) { + maybe_copy_unified_arg(0, (void*)ByteOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetAddress_v2) < 0 || rpc_write(0, ByteOffset, sizeof(size_t)) < 0 || @@ -7434,11 +10752,19 @@ CUresult cuTexRefSetAddress_v2(size_t* ByteOffset, CUtexref hTexRef, CUdeviceptr rpc_read(0, ByteOffset, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)ByteOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetAddress2D_v3(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR* desc, CUdeviceptr dptr, size_t Pitch) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Pitch, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetAddress2D_v3) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7448,11 +10774,18 @@ CUresult cuTexRefSetAddress2D_v3(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR* rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Pitch, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents) { + maybe_copy_unified_arg(0, 
(void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fmt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&NumPackedComponents, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetFormat) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7461,11 +10794,17 @@ CUresult cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedCo rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fmt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&NumPackedComponents, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dim, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&am, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetAddressMode) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7474,11 +10813,16 @@ CUresult cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dim, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&am, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fm, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetFilterMode) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7486,11 +10830,15 @@ CUresult cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fm, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fm, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMipmapFilterMode) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7498,11 +10846,15 @@ CUresult cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fm, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bias, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMipmapLevelBias) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7510,11 +10862,16 @@ CUresult cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bias, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minMipmapLevelClamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxMipmapLevelClamp, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMipmapLevelClamp) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7523,11 +10880,16 @@ CUresult cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minMipmapLevelClamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxMipmapLevelClamp, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxAniso, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMaxAnisotropy) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7535,11 +10897,15 @@ CUresult cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxAniso, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetBorderColor(CUtexref hTexRef, float* pBorderColor) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pBorderColor, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetBorderColor) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7548,11 +10914,15 @@ CUresult cuTexRefSetBorderColor(CUtexref hTexRef, float* pBorderColor) rpc_read(0, pBorderColor, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pBorderColor, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetFlags) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7560,11 +10930,15 @@ CUresult cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetAddress_v2(CUdeviceptr* pdptr, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, 
RPC_cuTexRefGetAddress_v2) < 0 || rpc_write(0, pdptr, sizeof(CUdeviceptr)) < 0 || @@ -7573,11 +10947,15 @@ CUresult cuTexRefGetAddress_v2(CUdeviceptr* pdptr, CUtexref hTexRef) rpc_read(0, pdptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetArray(CUarray* phArray, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)phArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetArray) < 0 || rpc_write(0, phArray, sizeof(CUarray)) < 0 || @@ -7586,11 +10964,15 @@ CUresult cuTexRefGetArray(CUarray* phArray, CUtexref hTexRef) rpc_read(0, phArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMipmappedArray(CUmipmappedArray* phMipmappedArray, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)phMipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetMipmappedArray) < 0 || rpc_write(0, phMipmappedArray, sizeof(CUmipmappedArray)) < 0 || @@ -7599,11 +10981,16 @@ CUresult cuTexRefGetMipmappedArray(CUmipmappedArray* phMipmappedArray, CUtexref rpc_read(0, phMipmappedArray, sizeof(CUmipmappedArray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phMipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetAddressMode(CUaddress_mode* pam, CUtexref hTexRef, int dim) { + maybe_copy_unified_arg(0, (void*)pam, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dim, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetAddressMode) < 0 || rpc_write(0, pam, sizeof(CUaddress_mode)) < 0 || @@ -7613,11 +11000,16 @@ CUresult cuTexRefGetAddressMode(CUaddress_mode* pam, CUtexref hTexRef, int dim) rpc_read(0, pam, sizeof(CUaddress_mode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pam, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dim, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetFilterMode(CUfilter_mode* pfm, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pfm, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetFilterMode) < 0 || rpc_write(0, pfm, sizeof(CUfilter_mode)) < 0 || @@ -7626,11 +11018,16 @@ CUresult cuTexRefGetFilterMode(CUfilter_mode* pfm, CUtexref hTexRef) rpc_read(0, pfm, sizeof(CUfilter_mode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pfm, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } 
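// --- Illustrative sketch (not part of the generated patch) ----------------
// Every stub touched in this diff follows the same shape: for each argument
// the generator emits maybe_copy_unified_arg(..., cudaMemcpyHostToDevice)
// before the RPC round-trip and maybe_copy_unified_arg(..., cudaMemcpyDeviceToHost)
// after it, so an argument backed by a tracked unified allocation is flushed
// toward the device first and refreshed on the host afterwards.  The
// hand-written stub below shows that pattern in isolation under the same
// assumptions as the surrounding file (the rpc_* helpers and
// maybe_copy_unified_arg seen in the stubs above); exampleGetValue and
// RPC_exampleGetValue are hypothetical names used only for illustration.
CUresult exampleGetValue(int* value, CUcontext ctx)
{
    // push any unified-memory backing of the arguments toward the device
    maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice);
    maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice);
    CUresult return_value;
    if (rpc_start_request(0, RPC_exampleGetValue) < 0 ||
        rpc_write(0, value, sizeof(int)) < 0 ||
        rpc_write(0, &ctx, sizeof(CUcontext)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_read(0, value, sizeof(int)) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return CUDA_ERROR_DEVICE_UNAVAILABLE;
    // pull the (possibly updated) unified-memory contents back to the host
    maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost);
    maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost);
    return return_value;
}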
CUresult cuTexRefGetFormat(CUarray_format* pFormat, int* pNumChannels, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pFormat, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNumChannels, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetFormat) < 0 || rpc_write(0, pFormat, sizeof(CUarray_format)) < 0 || @@ -7641,11 +11038,16 @@ CUresult cuTexRefGetFormat(CUarray_format* pFormat, int* pNumChannels, CUtexref rpc_read(0, pNumChannels, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pFormat, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNumChannels, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMipmapFilterMode(CUfilter_mode* pfm, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pfm, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetMipmapFilterMode) < 0 || rpc_write(0, pfm, sizeof(CUfilter_mode)) < 0 || @@ -7654,11 +11056,15 @@ CUresult cuTexRefGetMipmapFilterMode(CUfilter_mode* pfm, CUtexref hTexRef) rpc_read(0, pfm, sizeof(CUfilter_mode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pfm, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMipmapLevelBias(float* pbias, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pbias, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetMipmapLevelBias) < 0 || rpc_write(0, pbias, sizeof(float)) < 0 || @@ -7667,11 +11073,16 @@ CUresult cuTexRefGetMipmapLevelBias(float* pbias, CUtexref hTexRef) rpc_read(0, pbias, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pbias, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pminMipmapLevelClamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pmaxMipmapLevelClamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetMipmapLevelClamp) < 0 || rpc_write(0, pminMipmapLevelClamp, sizeof(float)) < 0 || @@ -7682,11 +11093,16 @@ CUresult cuTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMip rpc_read(0, pmaxMipmapLevelClamp, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pminMipmapLevelClamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pmaxMipmapLevelClamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMaxAnisotropy(int* pmaxAniso, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pmaxAniso, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuTexRefGetMaxAnisotropy) < 0 || rpc_write(0, pmaxAniso, sizeof(int)) < 0 || @@ -7695,11 +11111,15 @@ CUresult cuTexRefGetMaxAnisotropy(int* pmaxAniso, CUtexref hTexRef) rpc_read(0, pmaxAniso, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pmaxAniso, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetBorderColor(float* pBorderColor, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pBorderColor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetBorderColor) < 0 || rpc_write(0, pBorderColor, sizeof(float)) < 0 || @@ -7708,11 +11128,15 @@ CUresult cuTexRefGetBorderColor(float* pBorderColor, CUtexref hTexRef) rpc_read(0, pBorderColor, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pBorderColor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetFlags(unsigned int* pFlags, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pFlags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetFlags) < 0 || rpc_write(0, pFlags, sizeof(unsigned int)) < 0 || @@ -7721,11 +11145,14 @@ CUresult cuTexRefGetFlags(unsigned int* pFlags, CUtexref hTexRef) rpc_read(0, pFlags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pFlags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefCreate(CUtexref* pTexRef) { + maybe_copy_unified_arg(0, (void*)pTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefCreate) < 0 || rpc_write(0, pTexRef, sizeof(CUtexref)) < 0 || @@ -7733,22 +11160,28 @@ CUresult cuTexRefCreate(CUtexref* pTexRef) rpc_read(0, pTexRef, sizeof(CUtexref)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefDestroy(CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefDestroy) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hSurfRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSurfRefSetArray) < 0 || rpc_write(0, &hSurfRef, sizeof(CUsurfref)) < 0 || @@ -7757,11 +11190,16 @@ CUresult cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flag rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + 
maybe_copy_unified_arg(0, (void*)&hSurfRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfRefGetArray(CUarray* phArray, CUsurfref hSurfRef) { + maybe_copy_unified_arg(0, (void*)phArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hSurfRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSurfRefGetArray) < 0 || rpc_write(0, phArray, sizeof(CUarray)) < 0 || @@ -7770,11 +11208,17 @@ CUresult cuSurfRefGetArray(CUarray* phArray, CUsurfref hSurfRef) rpc_read(0, phArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hSurfRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectCreate(CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const CUDA_RESOURCE_VIEW_DESC* pResViewDesc) { + maybe_copy_unified_arg(0, (void*)pTexObject, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectCreate) < 0 || rpc_write(0, pTexObject, sizeof(CUtexObject)) < 0 || @@ -7785,22 +11229,30 @@ CUresult cuTexObjectCreate(CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pR rpc_read(0, pTexObject, sizeof(CUtexObject)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pTexObject, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectDestroy(CUtexObject texObject) { + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectDestroy) < 0 || rpc_write(0, &texObject, sizeof(CUtexObject)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUtexObject texObject) { + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectGetResourceDesc) < 0 || rpc_write(0, pResDesc, sizeof(CUDA_RESOURCE_DESC)) < 0 || @@ -7809,11 +11261,15 @@ CUresult cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUtexObject te rpc_read(0, pResDesc, sizeof(CUDA_RESOURCE_DESC)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC* pTexDesc, CUtexObject texObject) { + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuTexObjectGetTextureDesc) < 0 || rpc_write(0, pTexDesc, sizeof(CUDA_TEXTURE_DESC)) < 0 || @@ -7822,11 +11278,15 @@ CUresult cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC* pTexDesc, CUtexObject texO rpc_read(0, pTexDesc, sizeof(CUDA_TEXTURE_DESC)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC* pResViewDesc, CUtexObject texObject) { + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectGetResourceViewDesc) < 0 || rpc_write(0, pResViewDesc, sizeof(CUDA_RESOURCE_VIEW_DESC)) < 0 || @@ -7835,11 +11295,15 @@ CUresult cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC* pResViewDesc, C rpc_read(0, pResViewDesc, sizeof(CUDA_RESOURCE_VIEW_DESC)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfObjectCreate(CUsurfObject* pSurfObject, const CUDA_RESOURCE_DESC* pResDesc) { + maybe_copy_unified_arg(0, (void*)pSurfObject, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSurfObjectCreate) < 0 || rpc_write(0, pSurfObject, sizeof(CUsurfObject)) < 0 || @@ -7848,22 +11312,28 @@ CUresult cuSurfObjectCreate(CUsurfObject* pSurfObject, const CUDA_RESOURCE_DESC* rpc_read(0, pSurfObject, sizeof(CUsurfObject)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pSurfObject, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfObjectDestroy(CUsurfObject surfObject) { + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSurfObjectDestroy) < 0 || rpc_write(0, &surfObject, sizeof(CUsurfObject)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUsurfObject surfObject) { + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSurfObjectGetResourceDesc) < 0 || rpc_write(0, pResDesc, sizeof(CUDA_RESOURCE_DESC)) < 0 || @@ -7872,11 +11342,16 @@ CUresult cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUsurfObject rpc_read(0, pResDesc, sizeof(CUDA_RESOURCE_DESC)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceCanAccessPeer(int* canAccessPeer, CUdevice dev, CUdevice peerDev) { + maybe_copy_unified_arg(0, (void*)canAccessPeer, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&peerDev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceCanAccessPeer) < 0 || rpc_write(0, canAccessPeer, sizeof(int)) < 0 || @@ -7886,11 +11361,16 @@ CUresult cuDeviceCanAccessPeer(int* canAccessPeer, CUdevice dev, CUdevice peerDe rpc_read(0, canAccessPeer, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)canAccessPeer, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&peerDev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&peerContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxEnablePeerAccess) < 0 || rpc_write(0, &peerContext, sizeof(CUcontext)) < 0 || @@ -7898,22 +11378,30 @@ CUresult cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&peerContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxDisablePeerAccess(CUcontext peerContext) { + maybe_copy_unified_arg(0, (void*)&peerContext, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxDisablePeerAccess) < 0 || rpc_write(0, &peerContext, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&peerContext, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice) { + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetP2PAttribute) < 0 || rpc_write(0, value, sizeof(int)) < 0 || @@ -7924,22 +11412,32 @@ CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdev rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsUnregisterResource(CUgraphicsResource resource) { + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsUnregisterResource) < 0 || rpc_write(0, &resource, sizeof(CUgraphicsResource)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsSubResourceGetMappedArray(CUarray* 
pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel) { + maybe_copy_unified_arg(0, (void*)pArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&arrayIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipLevel, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsSubResourceGetMappedArray) < 0 || rpc_write(0, pArray, sizeof(CUarray)) < 0 || @@ -7950,11 +11448,17 @@ CUresult cuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource rpc_read(0, pArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&arrayIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipLevel, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray* pMipmappedArray, CUgraphicsResource resource) { + maybe_copy_unified_arg(0, (void*)pMipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsResourceGetMappedMipmappedArray) < 0 || rpc_write(0, pMipmappedArray, sizeof(CUmipmappedArray)) < 0 || @@ -7963,11 +11467,16 @@ CUresult cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray* pMipmappedA rpc_read(0, pMipmappedArray, sizeof(CUmipmappedArray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pMipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsResourceGetMappedPointer_v2(CUdeviceptr* pDevPtr, size_t* pSize, CUgraphicsResource resource) { + maybe_copy_unified_arg(0, (void*)pDevPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsResourceGetMappedPointer_v2) < 0 || rpc_write(0, pDevPtr, sizeof(CUdeviceptr)) < 0 || @@ -7978,11 +11487,16 @@ CUresult cuGraphicsResourceGetMappedPointer_v2(CUdeviceptr* pDevPtr, size_t* pSi rpc_read(0, pSize, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pDevPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsResourceSetMapFlags_v2(CUgraphicsResource resource, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsResourceSetMapFlags_v2) < 0 || rpc_write(0, &resource, sizeof(CUgraphicsResource)) < 0 || @@ -7990,11 +11504,16 @@ CUresult cuGraphicsResourceSetMapFlags_v2(CUgraphicsResource resource, unsigned rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsMapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsMapResources) < 0 || rpc_write(0, &count, sizeof(unsigned int)) < 0 || @@ -8004,11 +11523,17 @@ CUresult cuGraphicsMapResources(unsigned int count, CUgraphicsResource* resource rpc_read(0, resources, sizeof(CUgraphicsResource)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsUnmapResources) < 0 || rpc_write(0, &count, sizeof(unsigned int)) < 0 || @@ -8018,6 +11543,9 @@ CUresult cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resour rpc_read(0, resources, sizeof(CUgraphicsResource)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } @@ -8043,6 +11571,8 @@ cudaError_t cudaDeviceSynchronize() cudaError_t cudaDeviceSetLimit(enum cudaLimit limit, size_t value) { + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceSetLimit) < 0 || rpc_write(0, &limit, sizeof(enum cudaLimit)) < 0 || @@ -8050,11 +11580,15 @@ cudaError_t cudaDeviceSetLimit(enum cudaLimit limit, size_t value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetLimit(size_t* pValue, enum cudaLimit limit) { + maybe_copy_unified_arg(0, (void*)pValue, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetLimit) < 0 || rpc_write(0, pValue, sizeof(size_t)) < 0 || @@ -8063,11 +11597,16 @@ cudaError_t cudaDeviceGetLimit(size_t* pValue, enum cudaLimit limit) rpc_read(0, pValue, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pValue, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, const struct cudaChannelFormatDesc* fmtDesc, int device) { + maybe_copy_unified_arg(0, (void*)maxWidthInElements, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fmtDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetTexture1DLinearMaxWidth) < 0 || rpc_write(0, maxWidthInElements, sizeof(size_t)) < 0 || @@ -8077,11 +11616,15 @@ cudaError_t cudaDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, con rpc_read(0, maxWidthInElements, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)maxWidthInElements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fmtDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetCacheConfig(enum cudaFuncCache* pCacheConfig) { + maybe_copy_unified_arg(0, (void*)pCacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetCacheConfig) < 0 || rpc_write(0, pCacheConfig, sizeof(enum cudaFuncCache)) < 0 || @@ -8089,11 +11632,14 @@ cudaError_t cudaDeviceGetCacheConfig(enum cudaFuncCache* pCacheConfig) rpc_read(0, pCacheConfig, sizeof(enum cudaFuncCache)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pCacheConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) { + maybe_copy_unified_arg(0, (void*)leastPriority, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)greatestPriority, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetStreamPriorityRange) < 0 || rpc_write(0, leastPriority, sizeof(int)) < 0 || @@ -8103,22 +11649,27 @@ cudaError_t cudaDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPr rpc_read(0, greatestPriority, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)leastPriority, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)greatestPriority, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) { + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceSetCacheConfig) < 0 || rpc_write(0, &cacheConfig, sizeof(enum cudaFuncCache)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig* pConfig) { + maybe_copy_unified_arg(0, (void*)pConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetSharedMemConfig) < 0 || rpc_write(0, pConfig, sizeof(enum cudaSharedMemConfig)) < 0 || @@ -8126,22 +11677,27 @@ cudaError_t cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig* pConfig) rpc_read(0, pConfig, sizeof(enum cudaSharedMemConfig)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) { + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, 
RPC_cudaDeviceSetSharedMemConfig) < 0 || rpc_write(0, &config, sizeof(enum cudaSharedMemConfig)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetByPCIBusId(int* device, const char* pciBusId) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetByPCIBusId) < 0 || rpc_write(0, device, sizeof(int)) < 0 || @@ -8150,11 +11706,16 @@ cudaError_t cudaDeviceGetByPCIBusId(int* device, const char* pciBusId) rpc_read(0, device, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetPCIBusId(char* pciBusId, int len, int device) { + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetPCIBusId) < 0 || rpc_write(0, pciBusId, sizeof(char)) < 0 || @@ -8164,11 +11725,16 @@ cudaError_t cudaDeviceGetPCIBusId(char* pciBusId, int len, int device) rpc_read(0, pciBusId, sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaIpcGetEventHandle(cudaIpcEventHandle_t* handle, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaIpcGetEventHandle) < 0 || rpc_write(0, handle, sizeof(cudaIpcEventHandle_t)) < 0 || @@ -8177,11 +11743,15 @@ cudaError_t cudaIpcGetEventHandle(cudaIpcEventHandle_t* handle, cudaEvent_t even rpc_read(0, handle, sizeof(cudaIpcEventHandle_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaIpcOpenEventHandle(cudaEvent_t* event, cudaIpcEventHandle_t handle) { + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaIpcOpenEventHandle) < 0 || rpc_write(0, event, sizeof(cudaEvent_t)) < 0 || @@ -8190,11 +11760,16 @@ cudaError_t cudaIpcOpenEventHandle(cudaEvent_t* event, cudaIpcEventHandle_t hand rpc_read(0, event, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaIpcOpenMemHandle(void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaIpcOpenMemHandle) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -8204,11 +11779,16 @@ cudaError_t cudaIpcOpenMemHandle(void** devPtr, cudaIpcMemHandle_t handle, unsig rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWritesTarget target, enum cudaFlushGPUDirectRDMAWritesScope scope) { + maybe_copy_unified_arg(0, (void*)&target, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceFlushGPUDirectRDMAWrites) < 0 || rpc_write(0, &target, sizeof(enum cudaFlushGPUDirectRDMAWritesTarget)) < 0 || @@ -8216,6 +11796,8 @@ cudaError_t cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWrites rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&target, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyDeviceToHost); return return_value; } @@ -8241,6 +11823,8 @@ cudaError_t cudaThreadSynchronize() cudaError_t cudaThreadSetLimit(enum cudaLimit limit, size_t value) { + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadSetLimit) < 0 || rpc_write(0, &limit, sizeof(enum cudaLimit)) < 0 || @@ -8248,11 +11832,15 @@ cudaError_t cudaThreadSetLimit(enum cudaLimit limit, size_t value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaThreadGetLimit(size_t* pValue, enum cudaLimit limit) { + maybe_copy_unified_arg(0, (void*)pValue, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadGetLimit) < 0 || rpc_write(0, pValue, sizeof(size_t)) < 0 || @@ -8261,11 +11849,14 @@ cudaError_t cudaThreadGetLimit(size_t* pValue, enum cudaLimit limit) rpc_read(0, pValue, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pValue, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaThreadGetCacheConfig(enum cudaFuncCache* pCacheConfig) { + maybe_copy_unified_arg(0, (void*)pCacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadGetCacheConfig) < 0 || rpc_write(0, pCacheConfig, sizeof(enum cudaFuncCache)) < 0 || @@ -8273,17 +11864,20 @@ cudaError_t cudaThreadGetCacheConfig(enum cudaFuncCache* pCacheConfig) rpc_read(0, pCacheConfig, sizeof(enum cudaFuncCache)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + 
maybe_copy_unified_arg(0, (void*)pCacheConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) { + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadSetCacheConfig) < 0 || rpc_write(0, &cacheConfig, sizeof(enum cudaFuncCache)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyDeviceToHost); return return_value; } @@ -8309,17 +11903,21 @@ cudaError_t cudaPeekAtLastError() cudaError_t cudaGetDeviceCount(int* count) { + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDeviceCount) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, count, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetDeviceProperties_v2(struct cudaDeviceProp* prop, int device) { + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDeviceProperties_v2) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || @@ -8327,11 +11925,16 @@ cudaError_t cudaGetDeviceProperties_v2(struct cudaDeviceProp* prop, int device) rpc_read(0, prop, sizeof(struct cudaDeviceProp)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetAttribute(int* value, enum cudaDeviceAttr attr, int device) { + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetAttribute) < 0 || rpc_write(0, value, sizeof(int)) < 0 || @@ -8341,11 +11944,16 @@ cudaError_t cudaDeviceGetAttribute(int* value, enum cudaDeviceAttr attr, int dev rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) { + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetDefaultMemPool) < 0 || rpc_write(0, memPool, sizeof(cudaMemPool_t)) < 0 || @@ -8354,11 +11962,15 @@ cudaError_t cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) rpc_read(0, memPool, sizeof(cudaMemPool_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceSetMemPool(int device, cudaMemPool_t memPool) { + maybe_copy_unified_arg(0, (void*)&device, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceSetMemPool) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || @@ -8366,11 +11978,15 @@ cudaError_t cudaDeviceSetMemPool(int device, cudaMemPool_t memPool) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetMemPool(cudaMemPool_t* memPool, int device) { + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetMemPool) < 0 || rpc_write(0, memPool, sizeof(cudaMemPool_t)) < 0 || @@ -8379,11 +11995,17 @@ cudaError_t cudaDeviceGetMemPool(cudaMemPool_t* memPool, int device) rpc_read(0, memPool, sizeof(cudaMemPool_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetP2PAttribute(int* value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) { + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetP2PAttribute) < 0 || rpc_write(0, value, sizeof(int)) < 0 || @@ -8394,11 +12016,17 @@ cudaError_t cudaDeviceGetP2PAttribute(int* value, enum cudaDeviceP2PAttr attr, i rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaChooseDevice(int* device, const struct cudaDeviceProp* prop) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaChooseDevice) < 0 || rpc_write(0, device, sizeof(int)) < 0 || @@ -8407,11 +12035,16 @@ cudaError_t cudaChooseDevice(int* device, const struct cudaDeviceProp* prop) rpc_read(0, device, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaInitDevice(int device, unsigned int deviceFlags, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&deviceFlags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaInitDevice) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || @@ -8420,22 +12053,28 @@ cudaError_t cudaInitDevice(int device, unsigned int 
deviceFlags, unsigned int fl rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&deviceFlags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetDevice(int device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetDevice) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetDevice(int* device) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDevice) < 0 || rpc_write(0, device, sizeof(int)) < 0 || @@ -8443,11 +12082,14 @@ cudaError_t cudaGetDevice(int* device) rpc_read(0, device, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetValidDevices(int* device_arr, int len) { + maybe_copy_unified_arg(0, (void*)device_arr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetValidDevices) < 0 || rpc_write(0, device_arr, sizeof(int)) < 0 || @@ -8456,22 +12098,27 @@ cudaError_t cudaSetValidDevices(int* device_arr, int len) rpc_read(0, device_arr, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)device_arr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetDeviceFlags(unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetDeviceFlags) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetDeviceFlags(unsigned int* flags) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDeviceFlags) < 0 || rpc_write(0, flags, sizeof(unsigned int)) < 0 || @@ -8479,11 +12126,13 @@ cudaError_t cudaGetDeviceFlags(unsigned int* flags) rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamCreate(cudaStream_t* pStream) { + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamCreate) < 0 || rpc_write(0, pStream, sizeof(cudaStream_t)) < 0 || @@ -8491,11 +12140,14 @@ cudaError_t cudaStreamCreate(cudaStream_t* pStream) rpc_read(0, pStream, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaStreamCreateWithFlags(cudaStream_t* pStream, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamCreateWithFlags) < 0 || rpc_write(0, pStream, sizeof(cudaStream_t)) < 0 || @@ -8504,11 +12156,16 @@ cudaError_t cudaStreamCreateWithFlags(cudaStream_t* pStream, unsigned int flags) rpc_read(0, pStream, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamCreateWithPriority(cudaStream_t* pStream, unsigned int flags, int priority) { + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&priority, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamCreateWithPriority) < 0 || rpc_write(0, pStream, sizeof(cudaStream_t)) < 0 || @@ -8518,11 +12175,16 @@ cudaError_t cudaStreamCreateWithPriority(cudaStream_t* pStream, unsigned int fla rpc_read(0, pStream, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&priority, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int* priority) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)priority, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetPriority) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8531,11 +12193,15 @@ cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int* priority) rpc_read(0, priority, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)priority, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetFlags(cudaStream_t hStream, unsigned int* flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetFlags) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8544,11 +12210,15 @@ cudaError_t cudaStreamGetFlags(cudaStream_t hStream, unsigned int* flags) rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetId(cudaStream_t hStream, unsigned long long* streamId) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetId) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8557,6 +12227,8 @@ cudaError_t cudaStreamGetId(cudaStream_t hStream, unsigned long 
long* streamId) rpc_read(0, streamId, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyDeviceToHost); return return_value; } @@ -8572,6 +12244,8 @@ cudaError_t cudaCtxResetPersistingL2Cache() cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamCopyAttributes) < 0 || rpc_write(0, &dst, sizeof(cudaStream_t)) < 0 || @@ -8579,11 +12253,16 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetAttribute(cudaStream_t hStream, cudaLaunchAttributeID attr, cudaLaunchAttributeValue* value_out) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetAttribute) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8593,11 +12272,17 @@ cudaError_t cudaStreamGetAttribute(cudaStream_t hStream, cudaLaunchAttributeID a rpc_read(0, value_out, sizeof(cudaLaunchAttributeValue)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamSetAttribute(cudaStream_t hStream, cudaLaunchAttributeID attr, const cudaLaunchAttributeValue* value) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamSetAttribute) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8606,22 +12291,30 @@ cudaError_t cudaStreamSetAttribute(cudaStream_t hStream, cudaLaunchAttributeID a rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamDestroy(cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamDestroy) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamWaitEvent) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8630,33 +12323,42 @@ cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamSynchronize(cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamSynchronize) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamQuery(cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamQuery) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamBeginCapture) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8664,11 +12366,14 @@ cudaError_t cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMo rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode* mode) { + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadExchangeStreamCaptureMode) < 0 || rpc_write(0, mode, sizeof(enum cudaStreamCaptureMode)) < 0 || @@ -8676,11 +12381,14 @@ cudaError_t cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode* mode rpc_read(0, mode, sizeof(enum cudaStreamCaptureMode)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamEndCapture) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8689,11 +12397,15 @@ cudaError_t cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) rpc_read(0, pGraph, sizeof(cudaGraph_t)) < 0 || rpc_end_response(0, 
&return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus* pCaptureStatus) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pCaptureStatus, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamIsCapturing) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8702,11 +12414,21 @@ cudaError_t cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureSta rpc_read(0, pCaptureStatus, sizeof(enum cudaStreamCaptureStatus)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pCaptureStatus, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)captureStatus_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)id_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)graph_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numDependencies_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies_out, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*numDependencies_out); i++) + maybe_copy_unified_arg(0, (void*)&dependencies_out[i], cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetCaptureInfo_v2) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8718,11 +12440,25 @@ cudaError_t cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCapt rpc_read(0, dependencies_out, *numDependencies_out * sizeof(const cudaGraphNode_t*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)captureStatus_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)id_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)graph_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numDependencies_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies_out, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*numDependencies_out); i++) + maybe_copy_unified_arg(0, (void*)&dependencies_out[i], cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numDependencies); i++) + maybe_copy_unified_arg(0, (void*)&dependencies[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamUpdateCaptureDependencies) < 0 || 
rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8732,22 +12468,32 @@ cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNo rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numDependencies); i++) + maybe_copy_unified_arg(0, (void*)&dependencies[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventCreate(cudaEvent_t* event) { + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventCreate) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, event, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventCreateWithFlags(cudaEvent_t* event, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventCreateWithFlags) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || @@ -8755,11 +12501,15 @@ cudaError_t cudaEventCreateWithFlags(cudaEvent_t* event, unsigned int flags) rpc_read(0, event, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventRecord) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || @@ -8767,11 +12517,16 @@ cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventRecordWithFlags) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || @@ -8780,44 +12535,56 @@ cudaError_t cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, uns rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventQuery(cudaEvent_t event) { + maybe_copy_unified_arg(0, 
(void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventQuery) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventSynchronize(cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventSynchronize) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventDestroy(cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventDestroy) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end) { + maybe_copy_unified_arg(0, (void*)ms, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&start, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&end, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventElapsedTime) < 0 || rpc_write(0, &start, sizeof(cudaEvent_t)) < 0 || @@ -8826,11 +12593,17 @@ cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end) rpc_read(0, ms, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ms, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&start, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&end, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaExternalMemoryGetMappedBuffer(void** devPtr, cudaExternalMemory_t extMem, const struct cudaExternalMemoryBufferDesc* bufferDesc) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaExternalMemoryGetMappedBuffer) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -8840,11 +12613,17 @@ cudaError_t cudaExternalMemoryGetMappedBuffer(void** devPtr, cudaExternalMemory_ rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t* mipmap, cudaExternalMemory_t extMem, const struct cudaExternalMemoryMipmappedArrayDesc* mipmapDesc) { + maybe_copy_unified_arg(0, (void*)mipmap, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mipmapDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, 
RPC_cudaExternalMemoryGetMappedMipmappedArray) < 0 || rpc_write(0, mipmap, sizeof(cudaMipmappedArray_t)) < 0 || @@ -8854,22 +12633,29 @@ cudaError_t cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t* mipm rpc_read(0, mipmap, sizeof(cudaMipmappedArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)mipmap, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mipmapDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDestroyExternalMemory(cudaExternalMemory_t extMem) { + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDestroyExternalMemory) < 0 || rpc_write(0, &extMem, sizeof(cudaExternalMemory_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out, const struct cudaExternalSemaphoreHandleDesc* semHandleDesc) { + maybe_copy_unified_arg(0, (void*)extSem_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)semHandleDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaImportExternalSemaphore) < 0 || rpc_write(0, extSem_out, sizeof(cudaExternalSemaphore_t)) < 0 || @@ -8878,11 +12664,17 @@ cudaError_t cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out, con rpc_read(0, extSem_out, sizeof(cudaExternalSemaphore_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)extSem_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)semHandleDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const struct cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSignalExternalSemaphoresAsync_v2) < 0 || rpc_write(0, &extSemArray, sizeof(const cudaExternalSemaphore_t*)) < 0 || @@ -8892,11 +12684,19 @@ cudaError_t cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const struct cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaWaitExternalSemaphoresAsync_v2) < 0 || rpc_write(0, &extSemArray, sizeof(const cudaExternalSemaphore_t*)) < 0 || @@ -8906,22 +12706,31 @@ cudaError_t cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* ex rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) { + maybe_copy_unified_arg(0, (void*)&extSem, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDestroyExternalSemaphore) < 0 || rpc_write(0, &extSem, sizeof(cudaExternalSemaphore_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&extSem, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaLaunchKernelExC(const cudaLaunchConfig_t* config, const void* func, void** args) { + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)args, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaLaunchKernelExC) < 0 || rpc_write(0, &config, sizeof(const cudaLaunchConfig_t*)) < 0 || @@ -8931,11 +12740,20 @@ cudaError_t cudaLaunchKernelExC(const cudaLaunchConfig_t* config, const void* fu rpc_read(0, args, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)args, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaLaunchCooperativeKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDim, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDim, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)args, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sharedMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaLaunchCooperativeKernel) < 0 || rpc_write(0, &func, sizeof(const void*)) < 0 || @@ -8948,11 +12766,20 @@ cudaError_t cudaLaunchCooperativeKernel(const void* func, dim3 gridDim, dim3 blo rpc_read(0, args, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDim, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDim, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)args, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sharedMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams* launchParamsList, unsigned int numDevices, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)launchParamsList, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDevices, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaLaunchCooperativeKernelMultiDevice) < 0 || rpc_write(0, launchParamsList, sizeof(struct cudaLaunchParams)) < 0 || @@ -8962,11 +12789,16 @@ cudaError_t cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams* laun rpc_read(0, launchParamsList, sizeof(struct cudaLaunchParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)launchParamsList, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDevices, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFuncSetCacheConfig(const void* func, enum cudaFuncCache cacheConfig) { + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFuncSetCacheConfig) < 0 || rpc_write(0, &func, sizeof(const void*)) < 0 || @@ -8974,11 +12806,15 @@ cudaError_t cudaFuncSetCacheConfig(const void* func, enum cudaFuncCache cacheCon rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFuncSetSharedMemConfig(const void* func, enum cudaSharedMemConfig config) { + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFuncSetSharedMemConfig) < 0 || rpc_write(0, &func, sizeof(const void*)) < 0 || @@ -8986,11 +12822,15 @@ cudaError_t cudaFuncSetSharedMemConfig(const void* func, enum cudaSharedMemConfi rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes* attr, const void* func) { + maybe_copy_unified_arg(0, (void*)attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFuncGetAttributes) < 0 || rpc_write(0, attr, sizeof(struct cudaFuncAttributes)) < 0 || @@ -8999,11 +12839,16 @@ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes* attr, const void* f rpc_read(0, attr, sizeof(struct cudaFuncAttributes)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFuncSetAttribute(const void* func, enum cudaFuncAttribute attr, int value) { + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); 
cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFuncSetAttribute) < 0 || rpc_write(0, &func, sizeof(const void*)) < 0 || @@ -9012,11 +12857,15 @@ cudaError_t cudaFuncSetAttribute(const void* func, enum cudaFuncAttribute attr, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetDoubleForDevice(double* d) { + maybe_copy_unified_arg(0, (void*)d, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetDoubleForDevice) < 0 || rpc_write(0, d, sizeof(double)) < 0 || @@ -9024,11 +12873,13 @@ cudaError_t cudaSetDoubleForDevice(double* d) rpc_read(0, d, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)d, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetDoubleForHost(double* d) { + maybe_copy_unified_arg(0, (void*)d, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetDoubleForHost) < 0 || rpc_write(0, d, sizeof(double)) < 0 || @@ -9036,11 +12887,16 @@ cudaError_t cudaSetDoubleForHost(double* d) rpc_read(0, d, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)d, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const void* func, int blockSize, size_t dynamicSMemSize) { + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyMaxActiveBlocksPerMultiprocessor) < 0 || rpc_write(0, numBlocks, sizeof(int)) < 0 || @@ -9051,11 +12907,19 @@ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const rpc_read(0, numBlocks, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, const void* func, int numBlocks, int blockSize) { + maybe_copy_unified_arg(0, (void*)dynamicSmemSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyAvailableDynamicSMemPerBlock) < 0 || rpc_write(0, dynamicSmemSize, sizeof(size_t)) < 0 || @@ -9066,11 +12930,20 @@ cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, c rpc_read(0, dynamicSmemSize, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)dynamicSmemSize, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, const void* func, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) < 0 || rpc_write(0, numBlocks, sizeof(int)) < 0 || @@ -9082,11 +12955,19 @@ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlock rpc_read(0, numBlocks, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyMaxPotentialClusterSize(int* clusterSize, const void* func, const cudaLaunchConfig_t* launchConfig) { + maybe_copy_unified_arg(0, (void*)clusterSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)launchConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyMaxPotentialClusterSize) < 0 || rpc_write(0, clusterSize, sizeof(int)) < 0 || @@ -9096,11 +12977,17 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int* clusterSize, const void* f rpc_read(0, clusterSize, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)clusterSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)launchConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, const cudaLaunchConfig_t* launchConfig) { + maybe_copy_unified_arg(0, (void*)numClusters, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)launchConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyMaxActiveClusters) < 0 || rpc_write(0, numClusters, sizeof(int)) < 0 || @@ -9110,11 +12997,16 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c rpc_read(0, numClusters, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)numClusters, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)launchConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMalloc(void** devPtr, size_t size) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMalloc) < 0 || rpc_write(0, &size, sizeof(size_t)) < 0 || @@ -9122,11 +13014,15 @@ cudaError_t cudaMalloc(void** devPtr, size_t size) rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocHost(void** ptr, size_t size) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocHost) < 0 || rpc_write(0, ptr, sizeof(void*)) < 0 || @@ -9135,11 +13031,17 @@ cudaError_t cudaMallocHost(void** ptr, size_t size) rpc_read(0, ptr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocPitch) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -9151,11 +13053,20 @@ cudaError_t cudaMallocPitch(void** devPtr, size_t* pitch, size_t width, size_t h rpc_read(0, pitch, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDesc* desc, size_t width, size_t height, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocArray) < 0 || rpc_write(0, array, sizeof(cudaArray_t)) < 0 || @@ -9167,44 +13078,58 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe rpc_read(0, array, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFreeHost(void* ptr) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFreeHost) 
< 0 ||
        rpc_write(0, &ptr, sizeof(void*)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaFreeArray(cudaArray_t array)
{
+   maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaFreeArray) < 0 ||
        rpc_write(0, &array, sizeof(cudaArray_t)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray)
{
+   maybe_copy_unified_arg(0, (void*)&mipmappedArray, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaFreeMipmappedArray) < 0 ||
        rpc_write(0, &mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)&mipmappedArray, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags)
{
+   maybe_copy_unified_arg(0, (void*)pHost, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaHostAlloc) < 0 ||
        rpc_write(0, pHost, sizeof(void*)) < 0 ||
@@ -9214,11 +13139,16 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags)
        rpc_read(0, pHost, sizeof(void*)) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)pHost, cudaMemcpyDeviceToHost);
+   maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost);
+   maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent)
{
+   maybe_copy_unified_arg(0, (void*)pitchedDevPtr, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaMalloc3D) < 0 ||
        rpc_write(0, pitchedDevPtr, sizeof(struct cudaPitchedPtr)) < 0 ||
@@ -9227,11 +13157,17 @@ cudaError_t cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent
        rpc_read(0, pitchedDevPtr, sizeof(struct cudaPitchedPtr)) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)pitchedDevPtr, cudaMemcpyDeviceToHost);
+   maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaMalloc3DArray(cudaArray_t* array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags)
{
+   maybe_copy_unified_arg(0, (void*)array, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaMalloc3DArray) < 0 ||
        rpc_write(0, array, sizeof(cudaArray_t)) < 0 ||
@@ -9242,11 +13178,20 @@ cudaError_t cudaMalloc3DArray(cudaArray_t* array, const struct cudaChannelFormat
        rpc_read(0, array, sizeof(cudaArray_t)) < 0 ||
        rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocMipmappedArray(cudaMipmappedArray_t* mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)mipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numLevels, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocMipmappedArray) < 0 || rpc_write(0, mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || @@ -9258,11 +13203,19 @@ cudaError_t cudaMallocMipmappedArray(cudaMipmappedArray_t* mipmappedArray, const rpc_read(0, mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)mipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numLevels, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetMipmappedArrayLevel(cudaArray_t* levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) { + maybe_copy_unified_arg(0, (void*)levelArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetMipmappedArrayLevel) < 0 || rpc_write(0, levelArray, sizeof(cudaArray_t)) < 0 || @@ -9272,33 +13225,42 @@ cudaError_t cudaGetMipmappedArrayLevel(cudaArray_t* levelArray, cudaMipmappedArr rpc_read(0, levelArray, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)levelArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy3D(const struct cudaMemcpy3DParms* p) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy3D) < 0 || rpc_write(0, &p, sizeof(const struct cudaMemcpy3DParms*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms* p) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy3DPeer) < 0 || rpc_write(0, &p, sizeof(const struct cudaMemcpy3DPeerParms*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaMemcpy3DAsync(const struct cudaMemcpy3DParms* p, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy3DAsync) < 0 || rpc_write(0, &p, sizeof(const struct cudaMemcpy3DParms*)) < 0 || @@ -9306,11 +13268,15 @@ cudaError_t cudaMemcpy3DAsync(const struct cudaMemcpy3DParms* p, cudaStream_t st rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms* p, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy3DPeerAsync) < 0 || rpc_write(0, &p, sizeof(const struct cudaMemcpy3DPeerParms*)) < 0 || @@ -9318,11 +13284,15 @@ cudaError_t cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms* p, cudaStr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemGetInfo(size_t* free, size_t* total) { + maybe_copy_unified_arg(0, (void*)free, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)total, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemGetInfo) < 0 || rpc_write(0, free, sizeof(size_t)) < 0 || @@ -9332,11 +13302,17 @@ cudaError_t cudaMemGetInfo(size_t* free, size_t* total) rpc_read(0, total, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)free, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)total, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaArrayGetInfo(struct cudaChannelFormatDesc* desc, struct cudaExtent* extent, unsigned int* flags, cudaArray_t array) { + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)extent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaArrayGetInfo) < 0 || rpc_write(0, desc, sizeof(struct cudaChannelFormatDesc)) < 0 || @@ -9349,11 +13325,18 @@ cudaError_t cudaArrayGetInfo(struct cudaChannelFormatDesc* desc, struct cudaExte rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)extent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaArrayGetPlane(cudaArray_t* pPlaneArray, cudaArray_t hArray, unsigned int planeIdx) { + maybe_copy_unified_arg(0, (void*)pPlaneArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&planeIdx, cudaMemcpyHostToDevice); cudaError_t return_value; if 
(rpc_start_request(0, RPC_cudaArrayGetPlane) < 0 || rpc_write(0, pPlaneArray, sizeof(cudaArray_t)) < 0 || @@ -9363,11 +13346,17 @@ cudaError_t cudaArrayGetPlane(cudaArray_t* pPlaneArray, cudaArray_t hArray, unsi rpc_read(0, pPlaneArray, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pPlaneArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&planeIdx, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements* memoryRequirements, cudaArray_t array, int device) { + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaArrayGetMemoryRequirements) < 0 || rpc_write(0, memoryRequirements, sizeof(struct cudaArrayMemoryRequirements)) < 0 || @@ -9377,11 +13366,17 @@ cudaError_t cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements* m rpc_read(0, memoryRequirements, sizeof(struct cudaArrayMemoryRequirements)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements* memoryRequirements, cudaMipmappedArray_t mipmap, int device) { + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMipmappedArrayGetMemoryRequirements) < 0 || rpc_write(0, memoryRequirements, sizeof(struct cudaArrayMemoryRequirements)) < 0 || @@ -9391,11 +13386,16 @@ cudaError_t cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequir rpc_read(0, memoryRequirements, sizeof(struct cudaArrayMemoryRequirements)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaArrayGetSparseProperties(struct cudaArraySparseProperties* sparseProperties, cudaArray_t array) { + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaArrayGetSparseProperties) < 0 || rpc_write(0, sparseProperties, sizeof(struct cudaArraySparseProperties)) < 0 || @@ -9404,11 +13404,15 @@ cudaError_t cudaArrayGetSparseProperties(struct cudaArraySparseProperties* spars rpc_read(0, sparseProperties, sizeof(struct cudaArraySparseProperties)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperties* sparseProperties, cudaMipmappedArray_t mipmap) { + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMipmappedArrayGetSparseProperties) < 0 || rpc_write(0, sparseProperties, sizeof(struct cudaArraySparseProperties)) < 0 || @@ -9417,11 +13421,21 @@ cudaError_t cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperti rpc_read(0, sparseProperties, sizeof(struct cudaArraySparseProperties)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&spitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy2DToArray) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9435,11 +13449,28 @@ cudaError_t cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&spitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffsetDst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffsetDst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffsetSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffsetSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy2DArrayToArray) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9454,11 +13485,25 @@ cudaError_t cudaMemcpy2DArrayToArray(cudaArray_t 
dst, size_t wOffsetDst, size_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffsetDst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffsetDst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffsetSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffsetSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyToSymbol(const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyToSymbol) < 0 || rpc_write(0, &symbol, sizeof(const void*)) < 0 || @@ -9469,11 +13514,25 @@ cudaError_t cudaMemcpyToSymbol(const void* symbol, const void* src, size_t count rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&spitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy2DToArrayAsync) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9488,11 +13547,26 @@ cudaError_t cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOf rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&spitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyToSymbolAsync(const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyToSymbolAsync) < 0 || rpc_write(0, &symbol, sizeof(const void*)) < 0 || @@ -9504,11 +13578,20 @@ cudaError_t cudaMemcpyToSymbolAsync(const void* symbol, const void* src, size_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) { + maybe_copy_unified_arg(0, (void*)&pitchedDevPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemset3D) < 0 || rpc_write(0, &pitchedDevPtr, sizeof(struct cudaPitchedPtr)) < 0 || @@ -9517,11 +13600,18 @@ cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&pitchedDevPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&pitchedDevPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemset3DAsync) < 0 || rpc_write(0, &pitchedDevPtr, sizeof(struct cudaPitchedPtr)) < 0 || @@ -9531,11 +13621,17 @@ cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, st rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&pitchedDevPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaGetSymbolAddress(void** devPtr, const void* symbol) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetSymbolAddress) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -9544,11 +13640,15 @@ cudaError_t cudaGetSymbolAddress(void** devPtr, const void* symbol) rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetSymbolSize(size_t* size, const void* symbol) { + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetSymbolSize) < 0 || rpc_write(0, size, sizeof(size_t)) < 0 || @@ -9557,11 +13657,17 @@ cudaError_t cudaGetSymbolSize(size_t* size, const void* symbol) rpc_read(0, size, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPrefetchAsync) < 0 || rpc_write(0, &devPtr, sizeof(const void*)) < 0 || @@ -9571,11 +13677,19 @@ cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemAdvise(const void* devPtr, size_t count, enum cudaMemoryAdvise advice, int device) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&advice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemAdvise) < 0 || rpc_write(0, &devPtr, sizeof(const void*)) < 0 || @@ -9585,11 +13699,21 @@ cudaError_t cudaMemAdvise(const void* devPtr, size_t count, enum cudaMemoryAdvis rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&advice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemRangeGetAttributes(void** data, size_t* dataSizes, enum cudaMemRangeAttribute* attributes, size_t numAttributes, 
const void* devPtr, size_t count) { + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dataSizes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemRangeGetAttributes) < 0 || rpc_write(0, data, sizeof(void*)) < 0 || @@ -9604,11 +13728,23 @@ cudaError_t cudaMemRangeGetAttributes(void** data, size_t* dataSizes, enum cudaM rpc_read(0, attributes, sizeof(enum cudaMemRangeAttribute)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dataSizes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t count, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyToArray) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9620,11 +13756,25 @@ cudaError_t cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, c rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffsetDst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffsetDst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffsetSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffsetSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyArrayToArray) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9638,11 +13788,26 @@ cudaError_t cudaMemcpyArrayToArray(cudaArray_t 
dst, size_t wOffsetDst, size_t hO rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffsetDst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffsetDst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffsetSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffsetSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyToArrayAsync) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9655,11 +13820,21 @@ cudaError_t cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffs rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocAsync(void** devPtr, size_t size, cudaStream_t hStream) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocAsync) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -9669,11 +13844,16 @@ cudaError_t cudaMallocAsync(void** devPtr, size_t size, cudaStream_t hStream) rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep) { + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minBytesToKeep, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolTrimTo) < 0 || rpc_write(0, &memPool, sizeof(cudaMemPool_t)) < 0 || @@ -9681,11 +13861,16 @@ cudaError_t cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep) rpc_wait_for_response(0) < 0 
|| rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minBytesToKeep, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAccessDesc* descList, size_t count) { + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)descList, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolSetAccess) < 0 || rpc_write(0, &memPool, sizeof(cudaMemPool_t)) < 0 || @@ -9694,11 +13879,17 @@ cudaError_t cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAcce rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)descList, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolGetAccess(enum cudaMemAccessFlags* flags, cudaMemPool_t memPool, struct cudaMemLocation* location) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolGetAccess) < 0 || rpc_write(0, flags, sizeof(enum cudaMemAccessFlags)) < 0 || @@ -9709,11 +13900,16 @@ cudaError_t cudaMemPoolGetAccess(enum cudaMemAccessFlags* flags, cudaMemPool_t m rpc_read(0, location, sizeof(struct cudaMemLocation)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolCreate(cudaMemPool_t* memPool, const struct cudaMemPoolProps* poolProps) { + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)poolProps, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolCreate) < 0 || rpc_write(0, memPool, sizeof(cudaMemPool_t)) < 0 || @@ -9722,22 +13918,30 @@ cudaError_t cudaMemPoolCreate(cudaMemPool_t* memPool, const struct cudaMemPoolPr rpc_read(0, memPool, sizeof(cudaMemPool_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)poolProps, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolDestroy(cudaMemPool_t memPool) { + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolDestroy) < 0 || rpc_write(0, &memPool, sizeof(cudaMemPool_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocFromPoolAsync) < 0 || rpc_write(0, ptr, sizeof(void*)) < 0 || @@ -9748,11 +13952,18 @@ cudaError_t cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPo rpc_read(0, ptr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolImportPointer(void** ptr, cudaMemPool_t memPool, struct cudaMemPoolPtrExportData* exportData) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)exportData, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolImportPointer) < 0 || rpc_write(0, ptr, sizeof(void*)) < 0 || @@ -9763,11 +13974,16 @@ cudaError_t cudaMemPoolImportPointer(void** ptr, cudaMemPool_t memPool, struct c rpc_read(0, exportData, sizeof(struct cudaMemPoolPtrExportData)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)exportData, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaPointerGetAttributes(struct cudaPointerAttributes* attributes, const void* ptr) { + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaPointerGetAttributes) < 0 || rpc_write(0, attributes, sizeof(struct cudaPointerAttributes)) < 0 || @@ -9776,11 +13992,16 @@ cudaError_t cudaPointerGetAttributes(struct cudaPointerAttributes* attributes, c rpc_read(0, attributes, sizeof(struct cudaPointerAttributes)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevice) { + maybe_copy_unified_arg(0, (void*)canAccessPeer, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceCanAccessPeer) < 0 || rpc_write(0, canAccessPeer, sizeof(int)) < 0 || @@ -9790,11 +14011,16 @@ cudaError_t cudaDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevi rpc_read(0, canAccessPeer, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)canAccessPeer, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) { + 
maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaDeviceEnablePeerAccess) < 0 ||
        rpc_write(0, &peerDevice, sizeof(int)) < 0 ||
@@ -9802,33 +14028,41 @@ cudaError_t cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags)
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyDeviceToHost);
+   maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaDeviceDisablePeerAccess(int peerDevice)
{
+   maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaDeviceDisablePeerAccess) < 0 ||
        rpc_write(0, &peerDevice, sizeof(int)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource)
{
+   maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaGraphicsUnregisterResource) < 0 ||
        rpc_write(0, &resource, sizeof(cudaGraphicsResource_t)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags)
{
+   maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaGraphicsResourceSetMapFlags) < 0 ||
        rpc_write(0, &resource, sizeof(cudaGraphicsResource_t)) < 0 ||
@@ -9836,11 +14070,16 @@ cudaError_t cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, uns
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost);
+   maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaGraphicsMapResources(int count, cudaGraphicsResource_t* resources, cudaStream_t stream)
{
+   maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice);
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaGraphicsMapResources) < 0 ||
        rpc_write(0, &count, sizeof(int)) < 0 ||
@@ -9850,11 +14089,17 @@ cudaError_t cudaGraphicsMapResources(int count, cudaGraphicsResource_t* resource
        rpc_read(0, resources, sizeof(cudaGraphicsResource_t)) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;
+   maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost);
+   maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyDeviceToHost);
+   maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost);
    return return_value;
}

cudaError_t cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t* resources, cudaStream_t stream)
{
+   maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice);
+   maybe_copy_unified_arg(0, (void*)resources,
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsUnmapResources) < 0 || rpc_write(0, &count, sizeof(int)) < 0 || @@ -9864,11 +14109,17 @@ cudaError_t cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t* resour rpc_read(0, resources, sizeof(cudaGraphicsResource_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsResourceGetMappedPointer(void** devPtr, size_t* size, cudaGraphicsResource_t resource) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsResourceGetMappedPointer) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -9879,11 +14130,18 @@ cudaError_t cudaGraphicsResourceGetMappedPointer(void** devPtr, size_t* size, cu rpc_read(0, size, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsSubResourceGetMappedArray(cudaArray_t* array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) { + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&arrayIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipLevel, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsSubResourceGetMappedArray) < 0 || rpc_write(0, array, sizeof(cudaArray_t)) < 0 || @@ -9894,11 +14152,17 @@ cudaError_t cudaGraphicsSubResourceGetMappedArray(cudaArray_t* array, cudaGraphi rpc_read(0, array, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&arrayIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipLevel, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t* mipmappedArray, cudaGraphicsResource_t resource) { + maybe_copy_unified_arg(0, (void*)mipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsResourceGetMappedMipmappedArray) < 0 || rpc_write(0, mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || @@ -9907,11 +14171,15 @@ cudaError_t cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t* mi rpc_read(0, mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)mipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, 
cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetChannelDesc(struct cudaChannelFormatDesc* desc, cudaArray_const_t array) { + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetChannelDesc) < 0 || rpc_write(0, desc, sizeof(struct cudaChannelFormatDesc)) < 0 || @@ -9920,11 +14188,17 @@ cudaError_t cudaGetChannelDesc(struct cudaChannelFormatDesc* desc, cudaArray_con rpc_read(0, desc, sizeof(struct cudaChannelFormatDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaCreateTextureObject(cudaTextureObject_t* pTexObject, const struct cudaResourceDesc* pResDesc, const struct cudaTextureDesc* pTexDesc, const struct cudaResourceViewDesc* pResViewDesc) { + maybe_copy_unified_arg(0, (void*)pTexObject, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaCreateTextureObject) < 0 || rpc_write(0, pTexObject, sizeof(cudaTextureObject_t)) < 0 || @@ -9935,22 +14209,30 @@ cudaError_t cudaCreateTextureObject(cudaTextureObject_t* pTexObject, const struc rpc_read(0, pTexObject, sizeof(cudaTextureObject_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pTexObject, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDestroyTextureObject(cudaTextureObject_t texObject) { + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDestroyTextureObject) < 0 || rpc_write(0, &texObject, sizeof(cudaTextureObject_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetTextureObjectResourceDesc(struct cudaResourceDesc* pResDesc, cudaTextureObject_t texObject) { + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetTextureObjectResourceDesc) < 0 || rpc_write(0, pResDesc, sizeof(struct cudaResourceDesc)) < 0 || @@ -9959,11 +14241,15 @@ cudaError_t cudaGetTextureObjectResourceDesc(struct cudaResourceDesc* pResDesc, rpc_read(0, pResDesc, sizeof(struct cudaResourceDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetTextureObjectTextureDesc(struct cudaTextureDesc* pTexDesc, cudaTextureObject_t texObject) { + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, 
cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetTextureObjectTextureDesc) < 0 || rpc_write(0, pTexDesc, sizeof(struct cudaTextureDesc)) < 0 || @@ -9972,11 +14258,15 @@ cudaError_t cudaGetTextureObjectTextureDesc(struct cudaTextureDesc* pTexDesc, cu rpc_read(0, pTexDesc, sizeof(struct cudaTextureDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc* pResViewDesc, cudaTextureObject_t texObject) { + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetTextureObjectResourceViewDesc) < 0 || rpc_write(0, pResViewDesc, sizeof(struct cudaResourceViewDesc)) < 0 || @@ -9985,11 +14275,15 @@ cudaError_t cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc* pR rpc_read(0, pResViewDesc, sizeof(struct cudaResourceViewDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaCreateSurfaceObject(cudaSurfaceObject_t* pSurfObject, const struct cudaResourceDesc* pResDesc) { + maybe_copy_unified_arg(0, (void*)pSurfObject, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaCreateSurfaceObject) < 0 || rpc_write(0, pSurfObject, sizeof(cudaSurfaceObject_t)) < 0 || @@ -9998,22 +14292,28 @@ cudaError_t cudaCreateSurfaceObject(cudaSurfaceObject_t* pSurfObject, const stru rpc_read(0, pSurfObject, sizeof(cudaSurfaceObject_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pSurfObject, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) { + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDestroySurfaceObject) < 0 || rpc_write(0, &surfObject, sizeof(cudaSurfaceObject_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc* pResDesc, cudaSurfaceObject_t surfObject) { + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetSurfaceObjectResourceDesc) < 0 || rpc_write(0, pResDesc, sizeof(struct cudaResourceDesc)) < 0 || @@ -10022,11 +14322,14 @@ cudaError_t cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc* pResDesc, rpc_read(0, pResDesc, sizeof(struct cudaResourceDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&surfObject, 
cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDriverGetVersion(int* driverVersion) { + maybe_copy_unified_arg(0, (void*)driverVersion, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDriverGetVersion) < 0 || rpc_write(0, driverVersion, sizeof(int)) < 0 || @@ -10034,11 +14337,13 @@ cudaError_t cudaDriverGetVersion(int* driverVersion) rpc_read(0, driverVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)driverVersion, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) { + maybe_copy_unified_arg(0, (void*)runtimeVersion, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaRuntimeGetVersion) < 0 || rpc_write(0, runtimeVersion, sizeof(int)) < 0 || @@ -10046,11 +14351,14 @@ cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) rpc_read(0, runtimeVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)runtimeVersion, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphCreate) < 0 || rpc_write(0, pGraph, sizeof(cudaGraph_t)) < 0 || @@ -10059,11 +14367,18 @@ cudaError_t cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) rpc_read(0, pGraph, sizeof(cudaGraph_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddKernelNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaKernelNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddKernelNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10075,11 +14390,18 @@ cudaError_t cudaGraphAddKernelNode(cudaGraphNode_t* pGraphNode, cudaGraph_t grap rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, 
RPC_cudaGraphKernelNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10088,11 +14410,15 @@ cudaError_t cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernel rpc_read(0, pNodeParams, sizeof(struct cudaKernelNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeSetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10100,11 +14426,15 @@ cudaError_t cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cuda rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) { + maybe_copy_unified_arg(0, (void*)&hSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hDst, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeCopyAttributes) < 0 || rpc_write(0, &hSrc, sizeof(cudaGraphNode_t)) < 0 || @@ -10112,11 +14442,16 @@ cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNod rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hDst, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeGetAttribute(cudaGraphNode_t hNode, cudaLaunchAttributeID attr, cudaLaunchAttributeValue* value_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeGetAttribute) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10126,11 +14461,17 @@ cudaError_t cudaGraphKernelNodeGetAttribute(cudaGraphNode_t hNode, cudaLaunchAtt rpc_read(0, value_out, sizeof(cudaLaunchAttributeValue)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeSetAttribute(cudaGraphNode_t hNode, cudaLaunchAttributeID attr, const cudaLaunchAttributeValue* value) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeSetAttribute) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10139,11 +14480,19 @@ cudaError_t 
cudaGraphKernelNodeSetAttribute(cudaGraphNode_t hNode, cudaLaunchAtt rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddMemcpyNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms* pCopyParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pCopyParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddMemcpyNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10155,11 +14504,25 @@ cudaError_t cudaGraphAddMemcpyNode(cudaGraphNode_t* pGraphNode, cudaGraph_t grap rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pCopyParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddMemcpyNodeToSymbol(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddMemcpyNodeToSymbol) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10175,11 +14538,22 @@ cudaError_t cudaGraphAddMemcpyNodeToSymbol(cudaGraphNode_t* pGraphNode, cudaGrap rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemcpyNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10188,11 +14562,15 @@ cudaError_t cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy rpc_read(0, pNodeParams, sizeof(struct cudaMemcpy3DParms)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemcpyNodeSetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10200,11 +14578,19 @@ cudaError_t cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cuda rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(cudaGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemcpyNodeSetParamsToSymbol) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10216,11 +14602,22 @@ cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(cudaGraphNode_t node, const voi rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddMemsetNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaMemsetParams* pMemsetParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + 
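// Illustrative caller-side sketch, not part of the generated bindings: how a memset
// node would typically be added through the wrapper above. The destination pointer is
// assumed (hypothetically) to be a managed allocation that this client tracks, so the
// maybe_copy_unified_arg() calls bracketing cudaGraphAddMemsetNode() sync it before
// the RPC and again once the response has been read. The function name and parameters
// below are made up for the example.
static cudaError_t example_add_memset_node(cudaGraph_t graph, void* managed_dst, size_t n_ints)
{
    struct cudaMemsetParams params = {};
    params.dst = managed_dst;          // e.g. a cudaMallocManaged buffer (assumed tracked)
    params.value = 0;
    params.elementSize = sizeof(int);  // element size must be 1, 2, or 4 bytes
    params.width = n_ints;             // width in elements
    params.height = 1;                 // pitch is ignored when height == 1

    cudaGraphNode_t node = nullptr;
    // No dependencies: pass nullptr / 0 for pDependencies / numDependencies.
    return cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &params);
}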
maybe_copy_unified_arg(0, (void*)pMemsetParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddMemsetNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10232,11 +14629,18 @@ cudaError_t cudaGraphAddMemsetNode(cudaGraphNode_t* pGraphNode, cudaGraph_t grap rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pMemsetParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemsetNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10245,11 +14649,15 @@ cudaError_t cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemset rpc_read(0, pNodeParams, sizeof(struct cudaMemsetParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemsetNodeSetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10257,11 +14665,18 @@ cudaError_t cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cuda rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddHostNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaHostNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddHostNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10273,11 +14688,18 @@ cudaError_t cudaGraphAddHostNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphHostNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10286,11 +14708,15 @@ cudaError_t cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNode rpc_read(0, pNodeParams, sizeof(struct cudaHostNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphHostNodeSetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10298,11 +14724,18 @@ cudaError_t cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHo rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddChildGraphNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraph_t childGraph) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddChildGraphNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10314,11 +14747,18 @@ cudaError_t cudaGraphAddChildGraphNode(cudaGraphNode_t* pGraphNode, cudaGraph_t rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t* pGraph) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphChildGraphNodeGetGraph) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10327,11 +14767,17 @@ cudaError_t cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, 
cudaGraph_t* p rpc_read(0, pGraph, sizeof(cudaGraph_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddEmptyNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddEmptyNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10342,11 +14788,20 @@ cudaError_t cudaGraphAddEmptyNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddEventRecordNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddEventRecordNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10358,11 +14813,18 @@ cudaError_t cudaGraphAddEventRecordNode(cudaGraphNode_t* pGraphNode, cudaGraph_t rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t* event_out) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphEventRecordNodeGetEvent) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10371,11 +14833,15 @@ cudaError_t cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t* rpc_read(0, event_out, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphEventRecordNodeSetEvent) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10383,11 +14849,18 @@ cudaError_t cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t e rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddEventWaitNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddEventWaitNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10399,11 +14872,18 @@ cudaError_t cudaGraphAddEventWaitNode(cudaGraphNode_t* pGraphNode, cudaGraph_t g rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t* event_out) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphEventWaitNodeGetEvent) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10412,11 +14892,15 @@ cudaError_t cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t* ev rpc_read(0, event_out, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphEventWaitNodeSetEvent) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10424,11 +14908,18 @@ cudaError_t cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t eve rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreSignalNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddExternalSemaphoresSignalNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10440,11 +14931,18 @@ cudaError_t cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t* pGraphNode rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreSignalNodeParams* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExternalSemaphoresSignalNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10453,11 +14951,15 @@ cudaError_t cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode rpc_read(0, params_out, sizeof(struct cudaExternalSemaphoreSignalNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExternalSemaphoresSignalNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10465,11 +14967,18 @@ cudaError_t cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreWaitNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddExternalSemaphoresWaitNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10481,11 +14990,18 @@ cudaError_t cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t* pGraphNode, rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreWaitNodeParams* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExternalSemaphoresWaitNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10494,11 +15010,15 @@ cudaError_t cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, rpc_read(0, params_out, sizeof(struct cudaExternalSemaphoreWaitNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExternalSemaphoresWaitNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10506,11 +15026,18 @@ cudaError_t cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddMemAllocNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, struct cudaMemAllocNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddMemAllocNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10523,11 +15050,18 @@ cudaError_t cudaGraphAddMemAllocNode(cudaGraphNode_t* pGraphNode, cudaGraph_t gr rpc_read(0, nodeParams, sizeof(struct cudaMemAllocNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + 
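// Illustrative sketch (assumed usage, not generated output): adding a mem-alloc node
// through the wrapper above. cudaGraphAddMemAllocNode() returns the reserved device
// address in nodeParams->dptr; the wrapper reads nodeParams back via rpc_read() before
// its trailing maybe_copy_unified_arg() pass, so the caller can pick the address up
// from the params struct afterwards. The helper name below is hypothetical.
static cudaError_t example_add_mem_alloc_node(cudaGraph_t graph, int device, size_t bytes,
                                              cudaGraphNode_t* node_out, void** dptr_out)
{
    struct cudaMemAllocNodeParams params = {};
    params.poolProps.allocType = cudaMemAllocationTypePinned;
    params.poolProps.location.type = cudaMemLocationTypeDevice;
    params.poolProps.location.id = device;
    params.bytesize = bytes;

    cudaError_t err = cudaGraphAddMemAllocNode(node_out, graph, nullptr, 0, &params);
    if (err == cudaSuccess)
        *dptr_out = params.dptr;  // filled in by the call and read back over the RPC
    return err;
}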
maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemAllocNodeParams* params_out) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemAllocNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10536,22 +15070,28 @@ cudaError_t cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemA rpc_read(0, params_out, sizeof(struct cudaMemAllocNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGraphMemTrim(int device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGraphMemTrim) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphClone(cudaGraph_t* pGraphClone, cudaGraph_t originalGraph) { + maybe_copy_unified_arg(0, (void*)pGraphClone, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&originalGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphClone) < 0 || rpc_write(0, pGraphClone, sizeof(cudaGraph_t)) < 0 || @@ -10560,11 +15100,16 @@ cudaError_t cudaGraphClone(cudaGraph_t* pGraphClone, cudaGraph_t originalGraph) rpc_read(0, pGraphClone, sizeof(cudaGraph_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphClone, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&originalGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeFindInClone(cudaGraphNode_t* pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph) { + maybe_copy_unified_arg(0, (void*)pNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&originalNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clonedGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeFindInClone) < 0 || rpc_write(0, pNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10574,11 +15119,16 @@ cudaError_t cudaGraphNodeFindInClone(cudaGraphNode_t* pNode, cudaGraphNode_t ori rpc_read(0, pNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&originalNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clonedGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType* pType) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)pType, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeGetType) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10587,11 +15137,16 @@ cudaError_t cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType* p rpc_read(0, pType, sizeof(enum cudaGraphNodeType)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pType, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t* nodes, size_t* numNodes) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numNodes, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphGetNodes) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10602,11 +15157,17 @@ cudaError_t cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t* nodes, size_t* rpc_read(0, numNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numNodes, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRootNodes, size_t* pNumRootNodes) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pRootNodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNumRootNodes, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphGetRootNodes) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10617,11 +15178,18 @@ cudaError_t cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRootNodes rpc_read(0, pNumRootNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pRootNodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNumRootNodes, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from, cudaGraphNode_t* to, size_t* numEdges) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numEdges, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphGetEdges) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10634,11 +15202,18 @@ cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from, cudaGrap rpc_read(0, numEdges, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numEdges, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* pNumDependencies) { + 
maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNumDependencies, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeGetDependencies) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10649,11 +15224,17 @@ cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* rpc_read(0, pNumDependencies, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNumDependencies, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependentNodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNumDependentNodes, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeGetDependentNodes) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10664,11 +15245,18 @@ cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t rpc_read(0, pNumDependentNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependentNodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNumDependentNodes, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddDependencies) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10678,11 +15266,19 @@ cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* f rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphRemoveDependencies) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10692,22 +15288,31 @@ cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t 
rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphDestroyNode(cudaGraphNode_t node) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphDestroyNode) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphInstantiate(cudaGraphExec_t* pGraphExec, cudaGraph_t graph, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphInstantiate) < 0 || rpc_write(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10717,11 +15322,17 @@ cudaError_t cudaGraphInstantiate(cudaGraphExec_t* pGraphExec, cudaGraph_t graph, rpc_read(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphInstantiateWithFlags(cudaGraphExec_t* pGraphExec, cudaGraph_t graph, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphInstantiateWithFlags) < 0 || rpc_write(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10731,11 +15342,17 @@ cudaError_t cudaGraphInstantiateWithFlags(cudaGraphExec_t* pGraphExec, cudaGraph rpc_read(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphInstantiateWithParams(cudaGraphExec_t* pGraphExec, cudaGraph_t graph, cudaGraphInstantiateParams* instantiateParams) { + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)instantiateParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphInstantiateWithParams) < 0 || rpc_write(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10746,11 +15363,16 @@ cudaError_t cudaGraphInstantiateWithParams(cudaGraphExec_t* pGraphExec, cudaGrap rpc_read(0, instantiateParams, sizeof(cudaGraphInstantiateParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphExec, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)instantiateParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecGetFlags(cudaGraphExec_t graphExec, unsigned long long* flags) { + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecGetFlags) < 0 || rpc_write(0, &graphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10759,11 +15381,16 @@ cudaError_t cudaGraphExecGetFlags(cudaGraphExec_t graphExec, unsigned long long* rpc_read(0, flags, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecKernelNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10772,11 +15399,17 @@ cudaError_t cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecMemcpyNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10785,11 +15418,21 @@ cudaError_t cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, 
cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecMemcpyNodeSetParamsToSymbol) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10802,11 +15445,21 @@ cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(cudaGraphExec_t hGraphExec, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecMemsetNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10815,11 +15468,17 @@ cudaError_t cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecHostNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10828,11 +15487,17 @@ cudaError_t cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraph rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecChildGraphNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10841,11 +15506,17 @@ cudaError_t cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cud rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + 
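// Illustrative sketch (assumed usage, not generated output): an end-to-end round trip
// using only wrappers that appear in this file. Each call below follows the same
// pattern as the wrappers above: unified arguments are synced host-to-device before
// the RPC and device-to-host once the response has been read. The helper name is
// hypothetical.
static cudaError_t example_build_and_launch_trivial_graph(void)
{
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t exec = nullptr;
    cudaGraphNode_t node = nullptr;
    cudaError_t err;

    if ((err = cudaGraphCreate(&graph, 0)) != cudaSuccess)
        return err;
    if ((err = cudaGraphAddEmptyNode(&node, graph, nullptr, 0)) != cudaSuccess)
        goto cleanup;
    if ((err = cudaGraphInstantiate(&exec, graph, 0)) != cudaSuccess)
        goto cleanup;

    err = cudaGraphLaunch(exec, nullptr /* default stream */);
    cudaGraphExecDestroy(exec);

cleanup:
    cudaGraphDestroy(graph);
    return err;
}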
maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecEventRecordNodeSetEvent) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10854,11 +15525,17 @@ cudaError_t cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cud rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecEventWaitNodeSetEvent) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10867,11 +15544,17 @@ cudaError_t cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaG rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecExternalSemaphoresSignalNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10880,11 +15563,17 @@ cudaError_t cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t h rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if 
(rpc_start_request(0, RPC_cudaGraphExecExternalSemaphoresWaitNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10893,11 +15582,17 @@ cudaError_t cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&isEnabled, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeSetEnabled) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10906,11 +15601,17 @@ cudaError_t cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&isEnabled, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int* isEnabled) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeGetEnabled) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10920,11 +15621,17 @@ cudaError_t cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t rpc_read(0, isEnabled, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphExecUpdateResultInfo* resultInfo) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resultInfo, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecUpdate) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10934,11 +15641,16 @@ cudaError_t cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, rpc_read(0, resultInfo, sizeof(cudaGraphExecUpdateResultInfo)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resultInfo, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&graphExec, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphUpload) < 0 || rpc_write(0, &graphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10946,11 +15658,15 @@ cudaError_t cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphLaunch) < 0 || rpc_write(0, &graphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10958,33 +15674,42 @@ cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecDestroy(cudaGraphExec_t graphExec) { + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecDestroy) < 0 || rpc_write(0, &graphExec, sizeof(cudaGraphExec_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphDestroy(cudaGraph_t graph) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphDestroy) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphDebugDotPrint(cudaGraph_t graph, const char* path, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphDebugDotPrint) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10993,11 +15718,16 @@ cudaError_t cudaGraphDebugDotPrint(cudaGraph_t graph, const char* path, unsigned rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaUserObjectRetain(cudaUserObject_t object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaUserObjectRetain) < 0 || rpc_write(0, &object, sizeof(cudaUserObject_t)) < 0 || @@ -11005,11 +15735,15 @@ cudaError_t 
cudaUserObjectRetain(cudaUserObject_t object, unsigned int count) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaUserObjectRelease(cudaUserObject_t object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaUserObjectRelease) < 0 || rpc_write(0, &object, sizeof(cudaUserObject_t)) < 0 || @@ -11017,11 +15751,17 @@ cudaError_t cudaUserObjectRelease(cudaUserObject_t object, unsigned int count) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphRetainUserObject) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -11031,11 +15771,18 @@ cudaError_t cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphReleaseUserObject) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -11044,11 +15791,18 @@ cudaError_t cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t objec rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetDriverEntryPoint(const char* symbol, void** funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult* driverStatus) { + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)funcPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)driverStatus, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDriverEntryPoint) < 0 || rpc_write(0, &symbol, 
sizeof(const char*)) < 0 || @@ -11060,11 +15814,17 @@ cudaError_t cudaGetDriverEntryPoint(const char* symbol, void** funcPtr, unsigned rpc_read(0, driverStatus, sizeof(enum cudaDriverEntryPointQueryResult)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)funcPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)driverStatus, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pExportTableId) { + maybe_copy_unified_arg(0, (void*)ppExportTable, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pExportTableId, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetExportTable) < 0 || rpc_write(0, ppExportTable, sizeof(const void*)) < 0 || @@ -11073,11 +15833,15 @@ cudaError_t cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pEx rpc_read(0, ppExportTable, sizeof(const void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ppExportTable, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pExportTableId, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolPtr) { + maybe_copy_unified_arg(0, (void*)functionPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbolPtr, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetFuncBySymbol) < 0 || rpc_write(0, functionPtr, sizeof(cudaFunction_t)) < 0 || @@ -11086,33 +15850,41 @@ cudaError_t cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolP rpc_read(0, functionPtr, sizeof(cudaFunction_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)functionPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbolPtr, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCreate_v2(cublasHandle_t* handle) { + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCreate_v2) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, handle, sizeof(cublasHandle_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDestroy_v2(cublasHandle_t handle) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDestroy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetVersion_v2(cublasHandle_t handle, int* version) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetVersion_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11121,11 +15893,15 @@ cublasStatus_t cublasGetVersion_v2(cublasHandle_t handle, int* version) rpc_read(0, 
version, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetProperty(libraryPropertyType type, int* value) { + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetProperty) < 0 || rpc_write(0, &type, sizeof(libraryPropertyType)) < 0 || @@ -11134,11 +15910,15 @@ cublasStatus_t cublasGetProperty(libraryPropertyType type, int* value) rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&streamId, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetStream_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11146,11 +15926,15 @@ cublasStatus_t cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&streamId, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetStream_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11159,11 +15943,15 @@ cublasStatus_t cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId) rpc_read(0, streamId, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t* mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetPointerMode_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11172,11 +15960,15 @@ cublasStatus_t cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_ rpc_read(0, mode, sizeof(cublasPointerMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, 
RPC_cublasSetPointerMode_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11184,11 +15976,15 @@ cublasStatus_t cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_ rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetAtomicsMode) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11197,11 +15993,15 @@ cublasStatus_t cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* rpc_read(0, mode, sizeof(cublasAtomicsMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetAtomicsMode) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11209,11 +16009,15 @@ cublasStatus_t cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t m rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetMathMode) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11222,11 +16026,15 @@ cublasStatus_t cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode) rpc_read(0, mode, sizeof(cublasMath_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetMathMode) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11234,11 +16042,15 @@ cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)smCountTarget, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetSmCountTarget) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11247,11 +16059,15 @@ cublasStatus_t cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget) rpc_read(0, smCountTarget, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)smCountTarget, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&smCountTarget, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetSmCountTarget) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11259,11 +16075,17 @@ cublasStatus_t cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&smCountTarget, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasLoggerConfigure(int logIsOn, int logToStdOut, int logToStdErr, const char* logFileName) { + maybe_copy_unified_arg(0, (void*)&logIsOn, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&logToStdOut, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&logToStdErr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)logFileName, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasLoggerConfigure) < 0 || rpc_write(0, &logIsOn, sizeof(int)) < 0 || @@ -11273,22 +16095,29 @@ cublasStatus_t cublasLoggerConfigure(int logIsOn, int logToStdOut, int logToStdE rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&logIsOn, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&logToStdOut, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&logToStdErr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)logFileName, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetLoggerCallback(cublasLogCallback userCallback) { + maybe_copy_unified_arg(0, (void*)&userCallback, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetLoggerCallback) < 0 || rpc_write(0, &userCallback, sizeof(cublasLogCallback)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&userCallback, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetLoggerCallback(cublasLogCallback* userCallback) { + maybe_copy_unified_arg(0, (void*)userCallback, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetLoggerCallback) < 0 || rpc_write(0, userCallback, sizeof(cublasLogCallback)) < 0 || @@ -11296,11 +16125,17 @@ cublasStatus_t cublasGetLoggerCallback(cublasLogCallback* userCallback) rpc_read(0, userCallback, sizeof(cublasLogCallback)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)userCallback, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSnrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11312,11 +16147,21 @@ cublasStatus_t cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSnrm2_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSnrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11328,11 +16173,21 @@ cublasStatus_t cublasSnrm2_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDnrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11344,11 +16199,21 @@ cublasStatus_t cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasDnrm2_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDnrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11360,11 +16225,21 @@ cublasStatus_t cublasDnrm2_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScnrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11376,11 +16251,21 @@ cublasStatus_t cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScnrm2_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScnrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11392,11 +16277,21 @@ cublasStatus_t cublasScnrm2_v2_64(cublasHandle_t handle, int64_t n, const cuComp rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDznrm2_v2(cublasHandle_t handle, int n, const 
cuDoubleComplex* x, int incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDznrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11408,11 +16303,21 @@ cublasStatus_t cublasDznrm2_v2(cublasHandle_t handle, int n, const cuDoubleCompl rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDznrm2_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDznrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11424,11 +16329,23 @@ cublasStatus_t cublasDznrm2_v2_64(cublasHandle_t handle, int64_t n, const cuDoub rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSdot_v2(cublasHandle_t handle, int n, const float* x, int incx, const float* y, int incy, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSdot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11442,11 +16359,25 @@ cublasStatus_t cublasSdot_v2(cublasHandle_t handle, int n, const float* x, int i rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + 
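For the cuBLAS level-1 wrappers in these hunks, the added copies are what keep managed allocations coherent across the RPC boundary. A minimal caller-side sketch, assuming the client interposes cudaMallocManaged and registers the allocations it returns (everything below is ordinary CUDA/cuBLAS API, nothing project-specific):

    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    int main()
    {
        const int n = 1024;
        float *x = nullptr, *y = nullptr, result = 0.0f;
        cudaMallocManaged(&x, n * sizeof(float));   // assumed to be tracked by the client
        cudaMallocManaged(&y, n * sizeof(float));
        for (int i = 0; i < n; ++i) { x[i] = 1.0f; y[i] = 2.0f; }  // plain host writes

        cublasHandle_t handle;
        cublasCreate_v2(&handle);
        // The generated wrapper offers x and y to maybe_copy_unified_arg host-to-device
        // before issuing the RPC, so the remote GPU sees the host writes above, and
        // mirrors the copies device-to-host after the response.
        cublasSdot_v2(handle, n, x, 1, y, 1, &result);

        cublasDestroy_v2(handle);
        cudaFree(x);
        cudaFree(y);
        return result == 2.0f * n ? 0 : 1;  // 1*2 summed over n elements
    }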
maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSdot_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, const float* y, int64_t incy, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSdot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11460,11 +16391,25 @@ cublasStatus_t cublasSdot_v2_64(cublasHandle_t handle, int64_t n, const float* x rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDdot_v2(cublasHandle_t handle, int n, const double* x, int incx, const double* y, int incy, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDdot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11478,11 +16423,25 @@ cublasStatus_t cublasDdot_v2(cublasHandle_t handle, int n, const double* x, int rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDdot_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, const double* y, int64_t incy, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDdot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11496,11 +16455,25 @@ cublasStatus_t cublasDdot_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdotu_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdotu_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11514,11 +16487,25 @@ cublasStatus_t cublasCdotu_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdotu_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdotu_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11532,11 +16519,25 @@ cublasStatus_t cublasCdotu_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, result, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdotc_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdotc_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11550,11 +16551,25 @@ cublasStatus_t cublasCdotc_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdotc_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdotc_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11568,11 +16583,25 @@ cublasStatus_t cublasCdotc_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, result, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdotu_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdotu_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11586,11 +16615,25 @@ cublasStatus_t cublasZdotu_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, result, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdotu_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdotu_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11604,11 +16647,25 @@ cublasStatus_t cublasZdotu_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, result, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdotc_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdotc_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11622,11 +16679,25 @@ cublasStatus_t cublasZdotc_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, result, 
sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdotc_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdotc_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11640,11 +16711,23 @@ cublasStatus_t cublasZdotc_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, result, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSscal_v2(cublasHandle_t handle, int n, const float* alpha, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11656,11 +16739,21 @@ cublasStatus_t cublasSscal_v2(cublasHandle_t handle, int n, const float* alpha, rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSscal_v2_64(cublasHandle_t handle, int64_t n, const float* alpha, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11672,11 +16765,21 @@ cublasStatus_t cublasSscal_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDscal_v2(cublasHandle_t handle, int n, const double* alpha, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11688,11 +16791,21 @@ cublasStatus_t cublasDscal_v2(cublasHandle_t handle, int n, const double* alpha, rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDscal_v2_64(cublasHandle_t handle, int64_t n, const double* alpha, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11704,11 +16817,21 @@ cublasStatus_t cublasDscal_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCscal_v2(cublasHandle_t handle, int n, const cuComplex* alpha, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11720,11 +16843,21 @@ cublasStatus_t cublasCscal_v2(cublasHandle_t handle, int n, const cuComplex* alp rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCscal_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* alpha, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11736,11 +16869,21 @@ cublasStatus_t cublasCscal_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsscal_v2(cublasHandle_t handle, int n, const float* alpha, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11752,11 +16895,21 @@ cublasStatus_t cublasCsscal_v2(cublasHandle_t handle, int n, const float* alpha, rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsscal_v2_64(cublasHandle_t handle, int64_t n, const float* alpha, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if 
(rpc_start_request(0, RPC_cublasCsscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11768,11 +16921,21 @@ cublasStatus_t cublasCsscal_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZscal_v2(cublasHandle_t handle, int n, const cuDoubleComplex* alpha, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11784,11 +16947,21 @@ cublasStatus_t cublasZscal_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZscal_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* alpha, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11800,11 +16973,21 @@ cublasStatus_t cublasZscal_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdscal_v2(cublasHandle_t handle, int n, const double* alpha, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, 
RPC_cublasZdscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11816,11 +16999,21 @@ cublasStatus_t cublasZdscal_v2(cublasHandle_t handle, int n, const double* alpha rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdscal_v2_64(cublasHandle_t handle, int64_t n, const double* alpha, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11832,11 +17025,23 @@ cublasStatus_t cublasZdscal_v2_64(cublasHandle_t handle, int64_t n, const double rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSaxpy_v2(cublasHandle_t handle, int n, const float* alpha, const float* x, int incx, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11850,11 +17055,25 @@ cublasStatus_t cublasSaxpy_v2(cublasHandle_t handle, int n, const float* alpha, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSaxpy_v2_64(cublasHandle_t handle, int64_t n, const float* alpha, const float* x, int64_t incx, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSaxpy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11868,11 +17087,25 @@ cublasStatus_t cublasSaxpy_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDaxpy_v2(cublasHandle_t handle, int n, const double* alpha, const double* x, int incx, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11886,11 +17119,25 @@ cublasStatus_t cublasDaxpy_v2(cublasHandle_t handle, int n, const double* alpha, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDaxpy_v2_64(cublasHandle_t handle, int64_t n, const double* alpha, const double* x, int64_t incx, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDaxpy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11904,11 +17151,25 @@ cublasStatus_t cublasDaxpy_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCaxpy_v2(cublasHandle_t handle, int n, const cuComplex* alpha, const cuComplex* x, int incx, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11922,11 +17183,25 @@ cublasStatus_t cublasCaxpy_v2(cublasHandle_t handle, int n, const cuComplex* alp rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCaxpy_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCaxpy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11940,11 +17215,25 @@ cublasStatus_t cublasCaxpy_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZaxpy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, 
cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11958,11 +17247,25 @@ cublasStatus_t cublasZaxpy_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZaxpy_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZaxpy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11976,11 +17279,24 @@ cublasStatus_t cublasZaxpy_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScopy_v2(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11993,11 +17309,23 @@ cublasStatus_t cublasScopy_v2(cublasHandle_t handle, int n, const 
float* x, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScopy_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12010,11 +17338,23 @@ cublasStatus_t cublasScopy_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDcopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12027,11 +17367,23 @@ cublasStatus_t cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDcopy_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDcopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12044,11 +17396,23 @@ cublasStatus_t cublasDcopy_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCcopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12061,11 +17425,23 @@ cublasStatus_t cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCcopy_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCcopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12078,11 +17454,23 @@ cublasStatus_t cublasCcopy_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, 
cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZcopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12095,11 +17483,23 @@ cublasStatus_t cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZcopy_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZcopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12112,11 +17512,23 @@ cublasStatus_t cublasZcopy_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSswap_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12130,11 +17542,23 @@ cublasStatus_t cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSswap_v2_64(cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSswap_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12148,11 +17572,23 @@ cublasStatus_t cublasSswap_v2_64(cublasHandle_t handle, int64_t n, float* x, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDswap_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12166,11 +17602,23 @@ cublasStatus_t cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDswap_v2_64(cublasHandle_t handle, int64_t n, double* x, int64_t incx, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDswap_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12184,11 +17632,23 @@ cublasStatus_t cublasDswap_v2_64(cublasHandle_t handle, int64_t n, double* x, 
in rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCswap_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12202,11 +17662,23 @@ cublasStatus_t cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int in rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCswap_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, int64_t incx, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCswap_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12220,11 +17692,23 @@ cublasStatus_t cublasCswap_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZswap_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12238,11 +17722,23 @@ cublasStatus_t cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZswap_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZswap_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12256,11 +17752,22 @@ cublasStatus_t cublasZswap_v2_64(cublasHandle_t handle, int64_t n, cuDoubleCompl rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIsamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12272,11 +17779,21 @@ cublasStatus_t cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIsamax_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIsamax_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12288,11 +17805,21 @@ cublasStatus_t cublasIsamax_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIdamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12304,11 +17831,21 @@ cublasStatus_t cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, in rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIdamax_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIdamax_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12320,11 +17857,21 @@ cublasStatus_t cublasIdamax_v2_64(cublasHandle_t handle, int64_t n, const double rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIcamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12336,11 +17883,21 @@ cublasStatus_t cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIcamax_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIcamax_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12352,11 +17909,21 @@ cublasStatus_t cublasIcamax_v2_64(cublasHandle_t handle, int64_t n, const cuComp rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIzamax_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIzamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12368,11 +17935,21 @@ cublasStatus_t cublasIzamax_v2(cublasHandle_t handle, int n, const cuDoubleCompl rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIzamax_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIzamax_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12384,11 +17961,22 @@ cublasStatus_t cublasIzamax_v2_64(cublasHandle_t handle, int64_t n, const cuDoub rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIamaxEx(cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIamaxEx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12401,11 +17989,23 @@ cublasStatus_t cublasIamaxEx(cublasHandle_t handle, int n, const void* x, cudaDa rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIamaxEx_64(cublasHandle_t handle, int64_t n, const void* x, cudaDataType xType, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIamaxEx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12418,11 +18018,22 @@ cublasStatus_t cublasIamaxEx_64(cublasHandle_t handle, int64_t n, const void* x, rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIsamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12434,11 +18045,21 @@ cublasStatus_t cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIsamin_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIsamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12450,11 +18071,21 @@ cublasStatus_t cublasIsamin_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIdamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12466,11 +18097,21 @@ cublasStatus_t cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, in rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIdamin_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIdamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12482,11 +18123,21 @@ cublasStatus_t cublasIdamin_v2_64(cublasHandle_t handle, int64_t n, const double rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIcamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12498,11 +18149,21 @@ cublasStatus_t cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIcamin_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIcamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12514,11 +18175,21 @@ cublasStatus_t cublasIcamin_v2_64(cublasHandle_t handle, int64_t n, const cuComp rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIzamin_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIzamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12530,11 +18201,21 @@ cublasStatus_t cublasIzamin_v2(cublasHandle_t handle, int n, const cuDoubleCompl rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIzamin_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIzamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12546,11 +18227,22 @@ cublasStatus_t cublasIzamin_v2_64(cublasHandle_t handle, int64_t n, const cuDoub rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIaminEx(cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIaminEx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12563,11 +18255,23 @@ cublasStatus_t cublasIaminEx(cublasHandle_t handle, int n, const void* x, cudaDa rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIaminEx_64(cublasHandle_t handle, int64_t n, const void* x, cudaDataType xType, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIaminEx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12580,11 +18284,22 @@ cublasStatus_t cublasIaminEx_64(cublasHandle_t handle, int64_t n, const void* x, rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12596,11 +18311,21 @@ cublasStatus_t cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSasum_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12612,11 +18337,21 @@ cublasStatus_t cublasSasum_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int incx, 
double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12628,11 +18363,21 @@ cublasStatus_t cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDasum_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12644,11 +18389,21 @@ cublasStatus_t cublasDasum_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12660,11 +18415,21 @@ cublasStatus_t cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScasum_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12676,11 +18441,21 @@ cublasStatus_t cublasScasum_v2_64(cublasHandle_t handle, int64_t n, const cuComp rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDzasum_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDzasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12692,11 +18467,21 @@ cublasStatus_t cublasDzasum_v2(cublasHandle_t handle, int n, const cuDoubleCompl rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDzasum_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDzasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12708,11 +18493,24 @@ cublasStatus_t cublasDzasum_v2_64(cublasHandle_t handle, int64_t n, const cuDoub rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrot_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, const float* c, const float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12728,11 +18526,27 @@ cublasStatus_t cublasSrot_v2(cublasHandle_t handle, int n, float* x, int incx, f rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrot_v2_64(cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy, const float* c, const float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12748,11 +18562,27 @@ cublasStatus_t cublasSrot_v2_64(cublasHandle_t handle, int64_t n, float* x, int6 rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrot_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, const double* c, const double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, 
cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12768,11 +18598,27 @@ cublasStatus_t cublasDrot_v2(cublasHandle_t handle, int n, double* x, int incx, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrot_v2_64(cublasHandle_t handle, int64_t n, double* x, int64_t incx, double* y, int64_t incy, const double* c, const double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12788,11 +18634,27 @@ cublasStatus_t cublasDrot_v2_64(cublasHandle_t handle, int64_t n, double* x, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCrot_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy, const float* c, const cuComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12808,11 +18670,27 @@ cublasStatus_t cublasCrot_v2(cublasHandle_t handle, int n, cuComplex* x, int inc rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCrot_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, int64_t incx, cuComplex* y, int64_t incy, const float* c, const cuComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12828,11 +18706,27 @@ cublasStatus_t cublasCrot_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsrot_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy, const float* c, const float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12848,11 +18742,27 @@ cublasStatus_t cublasCsrot_v2(cublasHandle_t handle, int n, cuComplex* x, int in rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsrot_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, int64_t incx, cuComplex* y, int64_t incy, const float* c, const float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12868,11 +18778,27 @@ cublasStatus_t cublasCsrot_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, const double* c, const cuDoubleComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12888,11 +18814,27 @@ cublasStatus_t cublasZrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy, const double* c, const cuDoubleComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12908,11 +18850,27 @@ cublasStatus_t cublasZrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComple rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, const double* c, const double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12928,11 +18886,27 @@ cublasStatus_t cublasZdrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy, const double* c, const double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12948,11 +18922,24 @@ cublasStatus_t cublasZdrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleCompl rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrotg_v2(cublasHandle_t handle, float* a, float* b, float* c, float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrotg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12967,11 +18954,21 @@ cublasStatus_t cublasSrotg_v2(cublasHandle_t handle, float* a, float* b, float* rpc_read(0, s, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrotg_v2(cublasHandle_t handle, double* a, double* b, double* c, double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrotg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12986,11 +18983,21 @@ cublasStatus_t cublasDrotg_v2(cublasHandle_t handle, double* a, double* b, doubl rpc_read(0, s, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCrotg_v2(cublasHandle_t handle, cuComplex* a, cuComplex* b, float* c, cuComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCrotg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13005,11 +19012,21 @@ cublasStatus_t cublasCrotg_v2(cublasHandle_t handle, cuComplex* a, cuComplex* b, rpc_read(0, s, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZrotg_v2(cublasHandle_t handle, cuDoubleComplex* a, cuDoubleComplex* b, double* c, cuDoubleComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZrotg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13024,11 +19041,23 @@ cublasStatus_t cublasZrotg_v2(cublasHandle_t handle, cuDoubleComplex* a, cuDoubl rpc_read(0, s, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrotm_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, const float* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrotm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13043,11 +19072,25 @@ cublasStatus_t cublasSrotm_v2(cublasHandle_t handle, int n, float* x, int incx, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrotm_v2_64(cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy, const float* param) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrotm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13062,11 +19105,25 @@ cublasStatus_t cublasSrotm_v2_64(cublasHandle_t handle, int64_t n, float* x, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrotm_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, const double* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrotm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13081,11 +19138,25 @@ cublasStatus_t cublasDrotm_v2(cublasHandle_t handle, int n, double* x, int incx, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrotm_v2_64(cublasHandle_t handle, int64_t n, double* x, int64_t incx, double* y, int64_t incy, const double* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrotm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13100,11 +19171,24 @@ cublasStatus_t cublasDrotm_v2_64(cublasHandle_t handle, int64_t n, 
double* x, in rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrotmg_v2(cublasHandle_t handle, float* d1, float* d2, float* x1, const float* y1, float* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)d1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)d2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrotmg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13120,11 +19204,23 @@ cublasStatus_t cublasSrotmg_v2(cublasHandle_t handle, float* d1, float* d2, floa rpc_read(0, param, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)d1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)d2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrotmg_v2(cublasHandle_t handle, double* d1, double* d2, double* x1, const double* y1, double* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)d1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)d2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrotmg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13140,11 +19236,29 @@ cublasStatus_t cublasDrotmg_v2(cublasHandle_t handle, double* d1, double* d2, do rpc_read(0, param, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)d1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)d2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const float* A, int lda, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13163,11 +19277,35 @@ cublasStatus_t cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13186,11 +19324,35 @@ cublasStatus_t cublasSgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double* alpha, const double* A, int lda, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13209,11 +19371,35 @@ cublasStatus_t cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13232,11 +19418,35 @@ cublasStatus_t cublasDgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13255,11 +19465,35 @@ cublasStatus_t cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const 
cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13278,11 +19512,35 @@ cublasStatus_t cublasCgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13301,11 +19559,35 @@ cublasStatus_t cublasZgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13324,11 +19606,37 @@ cublasStatus_t cublasZgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, int ku, const float* alpha, const float* A, int lda, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13349,11 +19657,39 @@ cublasStatus_t cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, int64_t kl, int64_t ku, const float* alpha, const float* A, int64_t lda, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13374,11 +19710,39 @@ cublasStatus_t cublasSgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, int ku, const double* alpha, const double* A, int lda, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13399,11 +19763,39 @@ cublasStatus_t cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, int64_t kl, int64_t ku, const double* alpha, const double* A, int64_t lda, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13424,11 +19816,39 @@ cublasStatus_t cublasDgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, int ku, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13449,11 +19869,39 @@ cublasStatus_t cublasCgbmv_v2(cublasHandle_t 
handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, int64_t kl, int64_t ku, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13474,11 +19922,39 @@ cublasStatus_t cublasCgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, int ku, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13499,11 +19975,39 @@ cublasStatus_t cublasZgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, int64_t kl, int64_t ku, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13524,11 +20028,34 @@ cublasStatus_t cublasZgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const float* A, int lda, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13544,11 +20071,29 @@ cublasStatus_t cublasStrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const float* A, int64_t lda, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13564,11 +20109,29 @@ cublasStatus_t cublasStrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double* A, int lda, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13584,11 +20147,29 @@ cublasStatus_t cublasDtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const double* A, int64_t lda, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13604,11 +20185,29 @@ cublasStatus_t cublasDtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13624,11 +20223,29 @@ cublasStatus_t cublasCtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuComplex* A, int64_t lda, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13644,11 +20261,29 @@ cublasStatus_t cublasCtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13664,11 +20299,29 @@ cublasStatus_t cublasZtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13684,11 +20337,30 @@ cublasStatus_t cublasZtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const float* A, int lda, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13705,11 +20377,31 @@ cublasStatus_t cublasStbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const float* A, int64_t lda, float* x, int64_t incx) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13726,11 +20418,31 @@ cublasStatus_t cublasStbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const double* A, int lda, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13747,11 +20459,31 @@ cublasStatus_t cublasDtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const double* A, int64_t lda, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13768,11 +20500,31 @@ cublasStatus_t cublasDtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13789,11 +20541,31 @@ cublasStatus_t cublasCtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const cuComplex* A, int64_t lda, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13810,11 +20582,31 @@ cublasStatus_t cublasCtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13831,11 +20623,31 @@ cublasStatus_t cublasZtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13852,11 +20664,29 @@ cublasStatus_t cublasZtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const float* AP, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13871,11 +20701,27 @@ cublasStatus_t cublasStpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const float* AP, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13890,11 +20736,27 @@ cublasStatus_t cublasStpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double* AP, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13909,11 +20771,27 @@ cublasStatus_t cublasDtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const double* AP, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13928,11 +20806,27 @@ cublasStatus_t cublasDtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuComplex* AP, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13947,11 +20841,27 @@ cublasStatus_t cublasCtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasCtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuComplex* AP, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13966,11 +20876,27 @@ cublasStatus_t cublasCtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13985,11 +20911,27 @@ cublasStatus_t cublasZtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuDoubleComplex* AP, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14004,11 +20946,28 @@ cublasStatus_t cublasZtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const float* A, int lda, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14024,11 +20983,29 @@ cublasStatus_t cublasStrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const float* A, int64_t lda, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14044,11 +21021,29 @@ cublasStatus_t cublasStrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double* A, int lda, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14064,11 +21059,29 @@ cublasStatus_t cublasDtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const double* A, int64_t lda, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14084,11 +21097,29 @@ cublasStatus_t cublasDtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14104,11 +21135,29 @@ cublasStatus_t cublasCtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuComplex* A, int64_t lda, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14124,11 +21173,29 @@ cublasStatus_t cublasCtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14144,11 +21211,29 @@ cublasStatus_t cublasZtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14164,11 +21249,28 @@ cublasStatus_t cublasZtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const float* AP, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14183,11 +21285,27 @@ cublasStatus_t cublasStpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const float* AP, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14202,11 +21320,27 @@ cublasStatus_t cublasStpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double* AP, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14221,11 +21355,27 @@ cublasStatus_t cublasDtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const double* AP, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14240,11 +21390,27 @@ cublasStatus_t cublasDtpsv_v2_64(cublasHandle_t handle, 
cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuComplex* AP, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14259,11 +21425,27 @@ cublasStatus_t cublasCtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuComplex* AP, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14278,11 +21460,27 @@ cublasStatus_t cublasCtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14297,11 +21495,27 @@ cublasStatus_t cublasZtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuDoubleComplex* AP, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14316,11 +21530,29 @@ cublasStatus_t cublasZtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const float* A, int lda, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStbsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14337,11 +21569,31 @@ cublasStatus_t cublasStbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const float* A, int64_t lda, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStbsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14358,11 +21610,31 @@ cublasStatus_t cublasStbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const double* A, int lda, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtbsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14379,11 +21651,31 @@ cublasStatus_t cublasDtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const double* A, int64_t lda, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtbsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14400,11 +21692,31 @@ cublasStatus_t cublasDtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtbsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14421,11 +21733,31 @@ cublasStatus_t cublasCtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const cuComplex* A, int64_t lda, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtbsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14442,11 
+21774,31 @@ cublasStatus_t cublasCtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtbsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14463,11 +21815,31 @@ cublasStatus_t cublasZtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtbsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14484,11 +21856,32 @@ cublasStatus_t cublasZtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* A, int lda, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsymv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14506,11 +21899,33 @@ cublasStatus_t cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* A, int64_t lda, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsymv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14528,11 +21943,33 @@ cublasStatus_t cublasSsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* A, int lda, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsymv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14550,11 +21987,33 @@ cublasStatus_t cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* A, int64_t lda, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsymv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14572,11 +22031,33 @@ cublasStatus_t cublasDsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsymv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14594,11 +22075,33 @@ cublasStatus_t cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsymv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14616,11 +22119,33 @@ cublasStatus_t cublasCsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsymv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14638,11 +22163,33 @@ cublasStatus_t cublasZsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsymv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14660,11 +22207,33 @@ cublasStatus_t cublasZsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* 
A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14682,11 +22251,33 @@ cublasStatus_t cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChemv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14704,11 +22295,33 @@ cublasStatus_t cublasChemv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14726,11 +22339,33 @@ cublasStatus_t cublasZhemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhemv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if 
(rpc_start_request(0, RPC_cublasZhemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14748,11 +22383,34 @@ cublasStatus_t cublasZhemv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, const float* alpha, const float* A, int lda, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14771,11 +22429,35 @@ cublasStatus_t cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14794,11 +22476,35 @@ cublasStatus_t cublasSsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, const double* alpha, const double* A, int lda, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14817,11 +22523,35 @@ cublasStatus_t cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14840,11 +22570,35 @@ cublasStatus_t cublasDsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14863,11 +22617,35 @@ cublasStatus_t cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14886,11 +22664,35 @@ cublasStatus_t cublasChbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasZhbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14909,11 +22711,35 @@ cublasStatus_t cublasZhbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14932,11 +22758,33 @@ cublasStatus_t cublasZhbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || 
rpc_end_response(0, &return_value) < 0)
return CUBLAS_STATUS_NOT_INITIALIZED;
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost);
return return_value;
}

cublasStatus_t cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* AP, const float* x, int incx, const float* beta, float* y, int incy)
{
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice);
cublasStatus_t return_value;
if (rpc_start_request(0, RPC_cublasSspmv_v2) < 0 ||
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -14953,11 +22801,31 @@ cublasStatus_t cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int
rpc_read(0, y, sizeof(float)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return CUBLAS_STATUS_NOT_INITIALIZED;
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost);
return return_value;
}

cublasStatus_t cublasSspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* AP, const float* x, int64_t incx, const float* beta, float* y, int64_t incy)
{
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice);
cublasStatus_t return_value;
if (rpc_start_request(0, RPC_cublasSspmv_v2_64) < 0 ||
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -14974,11 +22842,31 @@ cublasStatus_t cublasSspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i
rpc_read(0, y, sizeof(float)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return CUBLAS_STATUS_NOT_INITIALIZED;
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost);
return return_value;
}

cublasStatus_t cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* AP, const double* x, int incx, const double* beta, double* y, int incy)
{
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice);
cublasStatus_t return_value;
if (rpc_start_request(0, RPC_cublasDspmv_v2) < 0 ||
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -14995,11 +22883,31 @@ cublasStatus_t cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int
rpc_read(0, y, sizeof(double)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return CUBLAS_STATUS_NOT_INITIALIZED;
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost);
return return_value;
}

cublasStatus_t cublasDspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* AP, const double* x, int64_t incx, const double* beta, double* y, int64_t incy)
{
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15016,11 +22924,31 @@ cublasStatus_t cublasDspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* AP, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15037,11 +22965,31 @@ cublasStatus_t cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* AP, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15058,11 +23006,31 @@ cublasStatus_t cublasChpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* AP, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15079,11 +23047,31 @@ cublasStatus_t cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* AP, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15100,11 +23088,31 @@ cublasStatus_t cublasZhpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSger_v2(cublasHandle_t handle, int m, int n, const float* alpha, const float* x, int incx, const float* y, int incy, float* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSger_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15121,11 +23129,31 @@ cublasStatus_t cublasSger_v2(cublasHandle_t handle, int m, int n, const float* a rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const float* alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSger_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15142,11 +23170,31 @@ cublasStatus_t cublasSger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, con rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDger_v2(cublasHandle_t handle, int m, int n, const double* alpha, const double* x, int incx, const double* y, int incy, double* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDger_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15163,11 +23211,31 @@ cublasStatus_t cublasDger_v2(cublasHandle_t handle, int m, int n, const double* rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const double* alpha, const double* x, int64_t incx, const double* y, int64_t incy, double* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDger_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15184,11 +23252,31 @@ cublasStatus_t cublasDger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, con rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeru_v2(cublasHandle_t handle, int m, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeru_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15205,11 +23293,31 @@ cublasStatus_t cublasCgeru_v2(cublasHandle_t handle, int m, int n, const cuCompl rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeru_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeru_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15226,11 +23334,31 @@ cublasStatus_t cublasCgeru_v2_64(cublasHandle_t handle, int64_t m, int64_t n, co rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgerc_v2(cublasHandle_t handle, int m, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgerc_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15247,11 +23375,31 @@ 
cublasStatus_t cublasCgerc_v2(cublasHandle_t handle, int m, int n, const cuCompl
rpc_read(0, A, sizeof(cuComplex)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return CUBLAS_STATUS_NOT_INITIALIZED;
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost);
return return_value;
}

cublasStatus_t cublasCgerc_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* A, int64_t lda)
{
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice);
cublasStatus_t return_value;
if (rpc_start_request(0, RPC_cublasCgerc_v2_64) < 0 ||
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -15268,11 +23416,31 @@ cublasStatus_t cublasCgerc_v2_64(cublasHandle_t handle, int64_t m, int64_t n, co
rpc_read(0, A, sizeof(cuComplex)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return CUBLAS_STATUS_NOT_INITIALIZED;
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost);
+ maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost);
return return_value;
}

cublasStatus_t cublasZgeru_v2(cublasHandle_t handle, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* A, int lda)
{
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice);
+ maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeru_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15289,11 +23457,31 @@ cublasStatus_t cublasZgeru_v2(cublasHandle_t handle, int m, int n, const cuDoubl rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeru_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeru_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15310,11 +23498,31 @@ cublasStatus_t cublasZgeru_v2_64(cublasHandle_t handle, int64_t m, int64_t n, co rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgerc_v2(cublasHandle_t handle, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgerc_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15331,11 +23539,31 @@ cublasStatus_t cublasZgerc_v2(cublasHandle_t handle, int m, int n, const cuDoubl rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgerc_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgerc_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15352,11 +23580,29 @@ cublasStatus_t cublasZgerc_v2_64(cublasHandle_t handle, int64_t m, int64_t n, co rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, float* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15371,11 +23617,27 @@ cublasStatus_t cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* x, int64_t incx, float* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15390,11 +23652,27 @@ cublasStatus_t cublasSsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* x, int incx, double* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15409,11 +23687,27 @@ cublasStatus_t cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* x, int64_t incx, double* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15428,11 +23722,27 @@ cublasStatus_t cublasDsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* x, int incx, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15447,11 +23757,27 @@ cublasStatus_t cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, 
sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15466,11 +23792,27 @@ cublasStatus_t cublasCsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15485,11 +23827,27 @@ cublasStatus_t cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15504,11 +23862,27 @@ cublasStatus_t cublasZsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const cuComplex* x, int incx, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15523,11 +23897,27 @@ cublasStatus_t cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); 
return return_value; } cublasStatus_t cublasCher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const cuComplex* x, int64_t incx, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15542,11 +23932,27 @@ cublasStatus_t cublasCher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15561,11 +23967,27 @@ cublasStatus_t cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15580,11 +24002,26 @@ cublasStatus_t cublasZher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15598,11 +24035,25 @@ cublasStatus_t cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* x, int64_t incx, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15616,11 +24067,25 @@ 
cublasStatus_t cublasSspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* x, int incx, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15634,11 +24099,25 @@ cublasStatus_t cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* x, int64_t incx, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15652,11 +24131,25 @@ cublasStatus_t cublasDspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const cuComplex* x, int incx, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15670,11 +24163,25 @@ cublasStatus_t cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const cuComplex* x, int64_t incx, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15688,11 +24195,25 @@ cublasStatus_t cublasChpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15706,11 +24227,25 @@ cublasStatus_t cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15724,11 +24259,28 @@ cublasStatus_t cublasZhpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, const float* y, int incy, float* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15745,11 +24297,31 @@ cublasStatus_t cublasSsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15766,11 +24338,31 @@ cublasStatus_t cublasSsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* x, int incx, const double* y, int incy, double* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr2_v2) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -15787,11 +24379,31 @@ cublasStatus_t cublasDsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* x, int64_t incx, const double* y, int64_t incy, double* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15808,11 +24420,31 @@ cublasStatus_t cublasDsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15829,11 +24461,31 @@ cublasStatus_t cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15850,11 +24502,31 @@ cublasStatus_t cublasCsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15871,11 +24543,31 @@ cublasStatus_t cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15892,11 +24584,31 @@ cublasStatus_t cublasZsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, 
int incy, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15913,11 +24625,31 @@ cublasStatus_t cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15934,11 +24666,31 @@ cublasStatus_t cublasCher2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15955,11 +24707,31 @@ cublasStatus_t cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15976,11 +24748,30 @@ cublasStatus_t cublasZher2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, const float* y, int incy, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15996,11 +24787,29 @@ cublasStatus_t cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16016,11 +24825,29 @@ cublasStatus_t cublasSspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* x, int incx, const double* y, int incy, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16036,11 +24863,29 @@ cublasStatus_t cublasDspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* x, int64_t incx, const double* y, int64_t incy, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16056,11 +24901,29 @@ cublasStatus_t cublasDspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16076,11 +24939,29 @@ cublasStatus_t cublasChpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16096,11 +24977,29 @@ cublasStatus_t cublasChpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16116,11 +25015,29 @@ cublasStatus_t cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16136,11 +25053,39 @@ cublasStatus_t cublasZhpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const float* const Aarray[], int lda, const float* const xarray[], int incx, const float* beta, float* const yarray[], int incy, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)xarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)xarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)yarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)yarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemvBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -16159,11 +25104,49 @@ cublasStatus_t cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)xarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)xarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)yarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)yarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSTgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __nv_bfloat16* const 
Aarray[], int lda, const __nv_bfloat16* const xarray[], int incx, const float* beta, __nv_bfloat16* const yarray[], int incy, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)xarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)xarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)yarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)yarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSTgemvBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -16182,11 +25165,46 @@ cublasStatus_t cublasTSTgemvBatched(cublasHandle_t handle, cublasOperation_t tra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)xarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)xarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)yarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)yarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const float* A, int lda, long long int strideA, const float* x, int incx, long long int stridex, const float* beta, float* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16209,11 +25227,43 @@ cublasStatus_t cublasSgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, long long int strideA, const float* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16236,11 +25286,43 @@ cublasStatus_t cublasSgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double* alpha, const double* A, int lda, long long int strideA, const double* x, int incx, long long int stridex, const double* beta, double* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16263,11 +25345,43 @@ cublasStatus_t cublasDgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, long long int strideA, const double* x, int64_t incx, long long int stridex, const double* beta, double* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16290,11 +25404,43 @@ cublasStatus_t cublasDgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* x, int incx, long long int stridex, const cuComplex* beta, cuComplex* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16317,11 +25463,43 @@ cublasStatus_t cublasCgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* x, int64_t incx, long long int stridex, const cuComplex* beta, cuComplex* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16344,11 +25522,43 @@ cublasStatus_t cublasCgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, long long int strideA, const cuDoubleComplex* x, int incx, long long int stridex, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16371,11 +25581,43 @@ cublasStatus_t cublasZgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* x, int64_t incx, long long int stridex, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16398,11 +25640,43 @@ cublasStatus_t cublasZgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHSHgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __half* A, int lda, long long int strideA, const __half* x, int incx, long long int stridex, const float* beta, __half* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHSHgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16425,11 +25699,43 @@ cublasStatus_t cublasHSHgemvStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, y, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHSHgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __half* A, int64_t lda, long long int strideA, const __half* x, int64_t incx, long long int stridex, const float* beta, __half* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHSHgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16452,11 +25758,43 @@ cublasStatus_t cublasHSHgemvStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, y, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHSSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __half* A, int lda, long long int strideA, const __half* x, int incx, long long int stridex, const float* beta, float* y, int 
incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHSSgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16479,11 +25817,43 @@ cublasStatus_t cublasHSSgemvStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHSSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __half* A, int64_t lda, long long int strideA, const __half* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHSSgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16506,11 +25876,43 @@ cublasStatus_t cublasHSSgemvStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSTgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __nv_bfloat16* A, int lda, long long int strideA, const __nv_bfloat16* x, int incx, long long int stridex, const float* beta, __nv_bfloat16* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSTgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16533,11 +25935,43 @@ cublasStatus_t cublasTSTgemvStridedBatched(cublasHandle_t handle, cublasOperatio 
rpc_read(0, y, sizeof(__nv_bfloat16)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSTgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __nv_bfloat16* A, int64_t lda, long long int strideA, const __nv_bfloat16* x, int64_t incx, long long int stridex, const float* beta, __nv_bfloat16* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSTgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16560,11 +25994,43 @@ cublasStatus_t cublasTSTgemvStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, y, sizeof(__nv_bfloat16)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __nv_bfloat16* A, int lda, long long int strideA, const __nv_bfloat16* x, int incx, long long int stridex, const float* beta, float* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSSgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16587,11 +26053,43 @@ cublasStatus_t cublasTSSgemvStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const 
__nv_bfloat16* A, int64_t lda, long long int strideA, const __nv_bfloat16* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSSgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16614,11 +26112,41 @@ cublasStatus_t cublasTSSgemvStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16640,11 +26168,39 @@ cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16665,11 +26221,39 @@ cublasStatus_t cublasSgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16690,11 +26274,39 @@ cublasStatus_t cublasDgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16715,11 +26327,39 @@ cublasStatus_t cublasDgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16740,11 +26380,39 @@ cublasStatus_t cublasCgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16765,11 +26433,39 @@ cublasStatus_t cublasCgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3m(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, 
const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3m) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16790,11 +26486,39 @@ cublasStatus_t cublasCgemm3m(cublasHandle_t handle, cublasOperation_t transa, cu rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3m_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t 
return_value; if (rpc_start_request(0, RPC_cublasCgemm3m_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16815,11 +26539,39 @@ cublasStatus_t cublasCgemm3m_64(cublasHandle_t handle, cublasOperation_t transa, rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16840,11 +26592,39 @@ cublasStatus_t cublasZgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16865,11 +26645,39 @@ cublasStatus_t cublasZgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemm3m) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16890,11 +26698,39 @@ cublasStatus_t cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa, cu rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemm3m_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemm3m_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16915,11 +26751,39 @@ cublasStatus_t cublasZgemm3m_64(cublasHandle_t handle, cublasOperation_t transa, rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, const __half* B, int ldb, const __half* beta, __half* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16940,11 +26804,39 @@ cublasStatus_t cublasHgemm(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemm_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* A, int64_t lda, const __half* B, int64_t ldb, const __half* beta, __half* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16965,11 +26857,36 @@ cublasStatus_t cublasHgemm_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyrk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16987,11 +26904,33 @@ cublasStatus_t cublasSsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyrk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17009,11 +26948,33 @@ cublasStatus_t cublasSsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double* alpha, const double* A, int lda, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyrk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17031,11 +26992,33 @@ cublasStatus_t cublasDsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyrk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17053,11 +27036,33 @@ cublasStatus_t cublasDsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyrk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17075,11 +27080,33 @@ cublasStatus_t cublasCsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyrk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17097,11 +27124,33 @@ cublasStatus_t cublasCsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyrk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17119,11 +27168,33 @@ cublasStatus_t cublasZsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyrk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17141,11 +27212,33 @@ cublasStatus_t cublasZsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c 
rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const cuComplex* A, int lda, const float* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCherk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17163,11 +27256,33 @@ cublasStatus_t cublasCherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const float* alpha, const cuComplex* A, int64_t lda, const float* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCherk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17185,11 +27300,33 @@ cublasStatus_t cublasCherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double* alpha, const cuDoubleComplex* A, int lda, const double* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZherk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17207,11 +27344,33 @@ cublasStatus_t cublasZherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZherk_v2_64(cublasHandle_t handle, 
cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const double* alpha, const cuDoubleComplex* A, int64_t lda, const double* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZherk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17229,11 +27388,35 @@ cublasStatus_t cublasZherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17253,11 +27436,37 @@ cublasStatus_t cublasSsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17277,11 +27486,37 @@ cublasStatus_t cublasSsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17301,11 +27536,37 @@ cublasStatus_t cublasDsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17325,11 +27586,37 @@ cublasStatus_t cublasDsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17349,11 +27636,37 @@ cublasStatus_t cublasCsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17373,11 +27686,37 @@ cublasStatus_t cublasCsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17397,11 +27736,37 @@ cublasStatus_t cublasZsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17421,11 +27786,37 @@ cublasStatus_t cublasZsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const float* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17445,11 +27836,37 @@ cublasStatus_t cublasCher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const float* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17469,11 +27886,37 @@ cublasStatus_t cublasCher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const double* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17493,11 +27936,37 @@ cublasStatus_t cublasZher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const double* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17517,11 +27986,37 @@ cublasStatus_t cublasZher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyrkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17541,11 +28036,37 @@ cublasStatus_t cublasSsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyrkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17565,11 +28086,37 @@ cublasStatus_t cublasSsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyrkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17589,11 +28136,37 @@ cublasStatus_t cublasDsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyrkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17613,11 +28186,37 @@ cublasStatus_t cublasDsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyrkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17637,11 +28236,37 @@ cublasStatus_t cublasCsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyrkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17661,11 +28286,37 @@ cublasStatus_t cublasCsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyrkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17685,11 +28336,37 @@ cublasStatus_t cublasZsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyrkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17709,11 +28386,37 @@ cublasStatus_t cublasZsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const float* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCherkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17733,11 +28436,37 @@ cublasStatus_t cublasCherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const float* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCherkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17757,11 +28486,37 @@ cublasStatus_t cublasCherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost);
     return return_value;
 }
 
 cublasStatus_t cublasZherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const double* beta, cuDoubleComplex* C, int ldc)
 {
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice);
     cublasStatus_t return_value;
     if (rpc_start_request(0, RPC_cublasZherkx) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -17781,11 +28536,37 @@ cublasStatus_t cublasZherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas
         rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return CUBLAS_STATUS_NOT_INITIALIZED;
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost);
     return return_value;
 }
 
 cublasStatus_t cublasZherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const double* beta, cuDoubleComplex* C, int64_t ldc)
 {
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice);
     cublasStatus_t return_value;
     if (rpc_start_request(0, RPC_cublasZherkx_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -17805,11 +28586,37 @@ cublasStatus_t cublasZherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub
         rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return CUBLAS_STATUS_NOT_INITIALIZED;
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost);
     return return_value;
 }
 
 cublasStatus_t cublasSsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc)
 {
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice);
     cublasStatus_t return_value;
     if (rpc_start_request(0, RPC_cublasSsymm_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -17829,11 +28636,37 @@ cublasStatus_t cublasSsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl
         rpc_read(0, C, sizeof(float)) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return CUBLAS_STATUS_NOT_INITIALIZED;
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost);
     return return_value;
 }
 
 cublasStatus_t cublasSsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, const float* beta, float* C, int64_t ldc)
 {
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice);
     cublasStatus_t return_value;
     if (rpc_start_request(0, RPC_cublasSsymm_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -17853,11 +28686,37 @@ cublasStatus_t cublasSsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c
         rpc_read(0, C, sizeof(float)) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return CUBLAS_STATUS_NOT_INITIALIZED;
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost);
+    maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost);
     return return_value;
 }
 
 cublasStatus_t cublasDsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc)
 {
+    maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice);
+    maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice);
+
maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsymm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17877,11 +28736,37 @@ cublasStatus_t cublasDsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsymm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17901,11 +28786,37 @@ cublasStatus_t cublasDsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsymm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17925,11 +28836,37 @@ cublasStatus_t cublasCsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsymm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17949,11 +28886,37 @@ cublasStatus_t cublasCsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsymm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17973,11 +28936,37 @@ cublasStatus_t cublasZsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsymm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17997,11 +28986,37 @@ cublasStatus_t cublasZsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChemm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18021,11 +29036,37 @@ cublasStatus_t cublasChemm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChemm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18045,11 +29086,37 @@ cublasStatus_t cublasChemm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhemm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18069,11 +29136,37 @@ cublasStatus_t cublasZhemm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhemm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18093,11 +29186,36 @@ cublasStatus_t cublasZhemm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* A, int lda, float* B, int ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18116,11 +29234,35 @@ cublasStatus_t cublasStrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, B, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, float* B, int64_t ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18139,11 +29281,35 @@ cublasStatus_t cublasStrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, B, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* A, int lda, double* B, int ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18162,11 +29328,35 @@ cublasStatus_t cublasDtrsm_v2(cublasHandle_t 
handle, cublasSideMode_t side, cubl rpc_read(0, B, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, double* B, int64_t ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18185,11 +29375,35 @@ cublasStatus_t cublasDtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, B, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, cuComplex* B, int ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18208,11 +29422,35 @@ cublasStatus_t cublasCtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, B, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, cuComplex* B, int64_t ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18231,11 +29469,35 @@ cublasStatus_t cublasCtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, B, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, cuDoubleComplex* B, int ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18254,11 +29516,35 @@ cublasStatus_t cublasZtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, B, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* B, int64_t ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18277,11 +29563,37 @@ cublasStatus_t cublasZtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, B, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* A, int lda, const float* B, int ldb, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrmm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18302,11 +29614,39 @@ cublasStatus_t cublasStrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrmm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18327,11 +29667,39 @@ cublasStatus_t cublasStrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* A, int lda, const double* B, int ldb, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrmm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18352,11 +29720,39 @@ cublasStatus_t cublasDtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrmm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18377,11 +29773,39 @@ cublasStatus_t cublasDtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrmm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18402,11 +29826,39 @@ cublasStatus_t cublasCtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrmm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18427,11 +29879,39 @@ cublasStatus_t cublasCtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrmm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ 
-18452,11 +29932,39 @@ cublasStatus_t cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrmm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18477,11 +29985,46 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* const Aarray[], int lda, const __half* const Barray[], int ldb, const __half* beta, __half* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18502,11 +30045,53 @@ cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* const Aarray[], int64_t lda, const __half* const Barray[], int64_t ldb, const __half* beta, __half* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18527,11 +30112,53 @@ cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* const Aarray[], int lda, const float* const Barray[], int ldb, const float* beta, float* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18552,11 +30179,53 @@ cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const 
float* alpha, const float* const Aarray[], int64_t lda, const float* const Barray[], int64_t ldb, const float* beta, float* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18577,12 +30246,53 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, 
double* const Carray[], int ldc, int batchCount) { - cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18605,12 +30315,53 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* const Aarray[], int64_t lda, const double* const Barray[], int64_t ldb, const double* 
beta, double* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18633,11 +30384,53 @@ cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, 
(void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18660,11 +30453,53 @@ cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18687,11 +30522,53 @@ cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3mBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18714,11 +30591,53 @@ cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t tra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3mBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18741,11 +30660,53 @@ cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const Barray[], int ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18768,11 +30729,53 @@ cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int64_t lda, const cuDoubleComplex* const Barray[], int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18795,11 +30798,50 @@ cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, long long int strideA, const __half* B, int ldb, long long int strideB, const __half* beta, __half* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18826,11 +30868,47 @@ cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* A, int64_t lda, long long int strideA, const __half* B, int64_t ldb, long long int strideB, const __half* beta, __half* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18857,11 +30935,47 @@ cublasStatus_t cublasHgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, long long int strideA, const float* B, int ldb, long long int strideB, const float* beta, float* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if 
(rpc_start_request(0, RPC_cublasSgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18888,11 +31002,47 @@ cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, long long int strideA, const float* B, int64_t ldb, long long int strideB, const float* beta, float* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18919,11 +31069,47 @@ cublasStatus_t cublasSgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, long long int strideA, const double* B, int ldb, long long int strideB, const double* beta, double* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18950,11 +31136,47 @@ cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); 
+ maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, long long int strideA, const double* B, int64_t ldb, long long int strideB, const double* beta, double* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18981,11 +31203,47 @@ cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* B, int ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19012,11 +31270,47 @@ cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasCgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* B, int64_t ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19043,11 +31337,47 @@ cublasStatus_t cublasCgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* B, int ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19074,11 +31404,47 @@ cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3mStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* B, int64_t ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19105,11 +31471,47 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, long long int strideA, const cuDoubleComplex* B, int ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19136,11 +31538,47 @@ cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* B, int64_t ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19167,11 +31605,55 @@ cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int64_t lda, const void* const Barray[], cudaDataType Btype, int64_t ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int64_t ldc, int64_t batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Atype, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Btype, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); 
i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Ctype, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&computeType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGemmBatchedEx_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19199,11 +31681,50 @@ cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Atype, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Btype, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Ctype, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&computeType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t 
return_value; if (rpc_start_request(0, RPC_cublasSgeam) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19225,11 +31746,37 @@ cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* beta, const float* B, int64_t ldb, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgeam_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19251,11 +31798,37 @@ cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } 
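A hedged, caller-side sketch may help when reviewing these generated stubs: the bracketing above only pays off if the pointers handed to a wrapper are ones the client has recorded as unified (managed) allocations, so that the HostToDevice pass before the RPC and the DeviceToHost pass after it actually find something to mirror. The snippet below is illustrative only and not part of the generated file; run_geam_example is a hypothetical helper, and it assumes the client intercepts cudaMallocManaged and registers the resulting pointers (that hook is outside this hunk).

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Hypothetical caller exercising the generated cublasSgeam stub with managed
// buffers (column-major, m x n, no transpose).
int run_geam_example(cublasHandle_t handle, int m, int n)
{
    float *A = nullptr, *B = nullptr, *C = nullptr;
    // Managed allocations; assumed to be registered with the RPC client's
    // unified-pointer tracking when cudaMallocManaged is intercepted.
    cudaMallocManaged(&A, sizeof(float) * m * n);
    cudaMallocManaged(&B, sizeof(float) * m * n);
    cudaMallocManaged(&C, sizeof(float) * m * n);

    for (int i = 0; i < m * n; ++i) { A[i] = 1.0f; B[i] = 2.0f; }

    const float alpha = 1.0f, beta = 1.0f;
    // The stub's pre-call pass pushes A, B, and C's prior contents to the remote
    // device, the RPC runs the real cuBLAS call, and the post-call pass pulls C
    // back, so the result is readable on the host as soon as this returns.
    cublasStatus_t st = cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n,
                                    &alpha, A, m, &beta, B, m, C, m);

    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    return st == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}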
cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgeam) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19277,11 +31850,37 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* beta, const double* B, int64_t ldb, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgeam_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) 
< 0 || @@ -19303,11 +31902,37 @@ cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* beta, const cuComplex* B, int ldb, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeam) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19329,11 +31954,37 @@ cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, 
int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* beta, const cuComplex* B, int64_t ldb, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeam_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19355,11 +32006,37 @@ cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int ldb, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeam) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19381,11 +32058,37 @@ 
cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeam_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19407,11 +32110,41 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, 
cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* const A[], int lda, float* const B[], int ldb, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -19431,11 +32164,45 @@ cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* const A[], int64_t lda, float* const B[], int64_t ldb, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19455,11 +32222,45 @@ cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* const A[], int lda, double* const B[], int ldb, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsmBatched) < 0 || 
rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -19479,11 +32280,45 @@ cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* const A[], int64_t lda, double* const B[], int64_t ldb, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19503,11 +32338,45 @@ cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* const A[], int lda, cuComplex* const B[], int ldb, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -19527,11 +32396,45 @@ cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } 
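The batched wrappers above add a second level of copying: besides the pointer array itself, each element A[i] / B[i] is visited individually, so per-matrix managed buffers are mirrored even when the array holding them is a separate allocation. Another hedged caller-side sketch, again not part of the generated file (run_trsm_batched_example is a hypothetical helper, and the same assumption about cudaMallocManaged interception applies):

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Hypothetical caller for the batched triangular-solve stubs. Both the pointer
// arrays and the matrices they point to are managed allocations, so the stub's
// outer check (on Aarr/Barr) and its per-element loops (on Aarr[b]/Barr[b])
// each have something to mirror across the RPC boundary.
int run_trsm_batched_example(cublasHandle_t handle, int n, int batchCount)
{
    float **Aarr = nullptr, **Barr = nullptr;
    cudaMallocManaged(&Aarr, sizeof(float*) * batchCount);
    cudaMallocManaged(&Barr, sizeof(float*) * batchCount);
    for (int b = 0; b < batchCount; ++b) {
        cudaMallocManaged(&Aarr[b], sizeof(float) * n * n);  // triangular factor
        cudaMallocManaged(&Barr[b], sizeof(float) * n * n);  // right-hand sides
        // ... fill Aarr[b] and Barr[b] on the host ...
    }

    const float alpha = 1.0f;
    // After the call, each Barr[b] holds the solutions and is readable on the host.
    cublasStatus_t st = cublasStrsmBatched(handle, CUBLAS_SIDE_LEFT,
                                           CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
                                           CUBLAS_DIAG_NON_UNIT, n, n, &alpha,
                                           Aarr, n, Barr, n, batchCount);

    for (int b = 0; b < batchCount; ++b) {
        cudaFree(Aarr[b]);
        cudaFree(Barr[b]);
    }
    cudaFree(Aarr);
    cudaFree(Barr);
    return st == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}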
cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* const A[], int64_t lda, cuComplex* const B[], int64_t ldb, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19551,11 +32454,45 @@ cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const B[], int ldb, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -19575,11 +32512,45 @@ cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int64_t lda, cuDoubleComplex* const B[], int64_t ldb, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19599,11 +32570,38 @@ cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const float* A, int lda, const float* x, int incx, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSdgmm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19621,11 +32619,31 @@ cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const float* A, int64_t lda, const float* x, int64_t incx, float* C, int64_t 
ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSdgmm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19643,11 +32661,31 @@ cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const double* A, int lda, const double* x, int incx, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDdgmm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19665,11 +32703,31 @@ cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const double* A, int64_t lda, const double* x, int64_t incx, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDdgmm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19687,11 +32745,31 @@ cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const cuComplex* A, int lda, const cuComplex* x, int incx, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdgmm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19709,11 +32787,31 @@ cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdgmm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19731,11 +32829,31 @@ cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdgmm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19753,11 +32871,31 @@ cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdgmm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19775,11 +32913,33 @@ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* const A[], int lda, float* const Ainv[], int lda_inv, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSmatinvBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19794,11 +32954,35 @@ cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* c rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* const A[], int lda, double* const Ainv[], int lda_inv, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDmatinvBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19813,11 +32997,35 @@ cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, cuComplex* const Ainv[], int lda_inv, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCmatinvBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19832,11 +33040,35 @@ cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComple rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const Ainv[], int lda_inv, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZmatinvBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19851,11 +33083,35 @@ cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDouble rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); 
return return_value; } cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* const Aarray[], int lda, float* const TauArray[], int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgeqrfBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19870,11 +33126,35 @@ cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* c rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* const Aarray[], int lda, double* const TauArray[], int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgeqrfBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19889,11 +33169,35 @@ cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComplex* const Aarray[], int lda, cuComplex* const TauArray[], int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeqrfBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19908,11 +33212,35 @@ cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComple rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const TauArray[], int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)TauArray, 
cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeqrfBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19927,11 +33255,39 @@ cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDouble rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, float* const Aarray[], int lda, float* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgelsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19951,11 +33307,43 @@ cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans rpc_read(0, devInfoArray, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + 
maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, double* const Aarray[], int lda, double* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgelsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19975,11 +33363,43 @@ cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans rpc_read(0, devInfoArray, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuComplex* const Aarray[], int lda, cuComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgelsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19999,11 +33419,43 @@ cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans rpc_read(0, devInfoArray, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgelsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20023,11 +33475,33 @@ cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans rpc_read(0, devInfoArray, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpttr) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20041,11 +33515,23 @@ cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* AP, double* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpttr) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -20059,11 +33545,23 @@ cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* AP, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpttr) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20077,11 +33575,23 @@ cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* AP, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpttr) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20095,11 +33605,23 @@ cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* A, int lda, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrttp) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20113,11 +33635,23 @@ cublasStatus_t cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* A, int lda, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrttp) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20131,11 +33665,23 @@ cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* A, int lda, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrttp) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20149,11 +33695,23 @@ cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrttp) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20167,11 +33725,30 @@ cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* const A[], int lda, const int* P, float* const C[], int ldc, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgetriBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20188,11 +33765,37 @@ cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* co rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return 
return_value; } cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* const A[], int lda, const int* P, double* const C[], int ldc, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgetriBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20209,11 +33812,37 @@ cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* c rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, const int* P, cuComplex* const C[], int ldc, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgetriBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20230,11 +33859,37 @@ cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, const int* P, cuDoubleComplex* const C[], int ldc, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgetriBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20251,11 +33906,39 @@ cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleC rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float* const Aarray[], int lda, const int* devIpiv, float* const Barray[], int ldb, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgetrsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20274,11 +33957,41 @@ cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const double* const Aarray[], int lda, const int* devIpiv, double* const Barray[], int ldb, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgetrsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20297,11 +34010,41 @@ cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t 
tran rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuComplex* const Aarray[], int lda, const int* devIpiv, cuComplex* const Barray[], int ldb, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgetrsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20319,11 +34062,41 @@ cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuDoubleComplex* const Aarray[], int lda, const int* devIpiv, cuDoubleComplex* const Barray[], int ldb, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgetrsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20342,11 +34115,44 @@ cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasUint8gemmBias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, int m, int n, int k, const unsigned char* A, int A_bias, int lda, const unsigned char* B, int B_bias, int ldb, unsigned char* C, int C_bias, int ldc, int C_mult, int C_shift) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&A_bias, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&B_bias, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&C_bias, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&C_mult, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&C_shift, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasUint8gemmBias) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20371,11 +34177,31 @@ cublasStatus_t cublasUint8gemmBias(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, C, sizeof(unsigned char)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&A_bias, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&B_bias, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&C_bias, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&C_mult, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&C_shift, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int* value) { + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetProperty) < 0 || rpc_write(0, &type, sizeof(libraryPropertyType)) < 0 || @@ -20384,33 +34210,41 @@ cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int* value) rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreate(cudnnHandle_t* handle) { + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreate) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, handle, sizeof(cudnnHandle_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroy(cudnnHandle_t handle) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroy) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&streamId, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetStream) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20418,11 +34252,15 @@ cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&streamId, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t* streamId) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetStream) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20431,11 +34269,16 @@ cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t* streamId) rpc_read(0, streamId, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetCallback(unsigned* mask, void** udata, cudnnCallback_t* fptr) { + maybe_copy_unified_arg(0, (void*)mask, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)udata, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fptr, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetCallback) < 0 || rpc_write(0, mask, sizeof(unsigned)) < 0 || @@ -20447,6 +34290,9 @@ cudnnStatus_t cudnnGetCallback(unsigned* mask, void** udata, cudnnCallback_t* fp rpc_read(0, fptr, sizeof(cudnnCallback_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)mask, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)udata, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fptr, cudaMemcpyDeviceToHost); return return_value; } @@ -20462,6 +34308,8 @@ cudnnStatus_t cudnnGraphVersionCheck() cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t* descriptor) { + maybe_copy_unified_arg(0, (void*)&descriptorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)descriptor, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendCreateDescriptor) < 0 || rpc_write(0, &descriptorType, sizeof(cudnnBackendDescriptorType_t)) < 0 || @@ -20470,44 +34318,57 @@ cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descript rpc_read(0, descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)descriptor, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor) { + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendDestroyDescriptor) < 0 || rpc_write(0, &descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor) { + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendInitialize) < 0 || rpc_write(0, &descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor) { + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendFinalize) < 0 || rpc_write(0, &descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, cudnnBackendAttributeName_t attributeName, cudnnBackendAttributeType_t attributeType, int64_t elementCount, const void* arrayOfElements) { + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attributeName, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attributeType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&elementCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)arrayOfElements, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendSetAttribute) < 0 || rpc_write(0, &descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || @@ -20518,11 +34379,19 @@ cudnnStatus_t cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, cudn rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attributeName, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attributeType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&elementCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)arrayOfElements, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendExecute) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20531,11 +34400,18 @@ cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, 
cudnnBackendDescriptor_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t graph) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendPopulateCudaGraph) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20545,11 +34421,19 @@ cudnnStatus_t cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, cudnnBackendDe rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t graph) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendUpdateCudaGraph) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20559,11 +34443,16 @@ cudnnStatus_t cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, cudnnBackendDesc rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t* tensorDesc) { + maybe_copy_unified_arg(0, (void*)tensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateTensorDescriptor) < 0 || rpc_write(0, tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -20571,11 +34460,19 @@ cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t* tensorDesc) rpc_read(0, tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)tensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&format, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetTensor4dDescriptor) < 0 || rpc_write(0, &tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -20588,11 +34485,28 @@ cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cud rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wStride, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetTensor4dDescriptorEx) < 0 || rpc_write(0, &tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -20608,11 +34522,31 @@ cudnnStatus_t cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, c rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wStride, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t* dataType, int* n, int* c, int* h, int* w, int* nStride, int* cStride, int* hStride, int* wStride) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)n, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)cStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)hStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)wStride, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetTensor4dDescriptor) < 0 || rpc_write(0, &tensorDesc, sizeof(const cudnnTensorDescriptor_t)) < 0 || @@ -20637,11 +34571,23 @@ cudnnStatus_t cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDes rpc_read(0, wStride, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)cStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)hStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)wStride, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t* size) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetTensorSizeInBytes) < 0 || rpc_write(0, &tensorDesc, sizeof(const cudnnTensorDescriptor_t)) < 0 || @@ -20650,22 +34596,30 @@ cudnnStatus_t cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc rpc_read(0, size, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyTensorDescriptor) < 0 || rpc_write(0, &tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, const cudnnTensorDescriptor_t srcDesc, cudnnTensorDescriptor_t destDesc, size_t* destSizeInBytes) { + maybe_copy_unified_arg(0, (void*)&transformDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&destDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)destSizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnInitTransformDest) < 0 || rpc_write(0, &transformDesc, sizeof(const cudnnTensorTransformDescriptor_t)) < 0 || @@ -20676,11 
+34630,16 @@ cudnnStatus_t cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t tran rpc_read(0, destSizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&transformDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&destDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)destSizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t* transformDesc) { + maybe_copy_unified_arg(0, (void*)transformDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateTensorTransformDescriptor) < 0 || rpc_write(0, transformDesc, sizeof(cudnnTensorTransformDescriptor_t)) < 0 || @@ -20688,22 +34647,26 @@ cudnnStatus_t cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescripto rpc_read(0, transformDesc, sizeof(cudnnTensorTransformDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)transformDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc) { + maybe_copy_unified_arg(0, (void*)&transformDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyTensorTransformDescriptor) < 0 || rpc_write(0, &transformDesc, sizeof(cudnnTensorTransformDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&transformDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t* opTensorDesc) { + maybe_copy_unified_arg(0, (void*)opTensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateOpTensorDescriptor) < 0 || rpc_write(0, opTensorDesc, sizeof(cudnnOpTensorDescriptor_t)) < 0 || @@ -20711,11 +34674,16 @@ cudnnStatus_t cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t* opTensorD rpc_read(0, opTensorDesc, sizeof(cudnnOpTensorDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)opTensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&opTensorOp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&opTensorCompType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&opTensorNanOpt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetOpTensorDescriptor) < 0 || rpc_write(0, &opTensorDesc, sizeof(cudnnOpTensorDescriptor_t)) < 0 || @@ -20725,11 +34693,19 @@ cudnnStatus_t cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&opTensorOp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&opTensorCompType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&opTensorNanOpt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t* opTensorOp, cudnnDataType_t* opTensorCompType, cudnnNanPropagation_t* opTensorNanOpt) { + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)opTensorOp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)opTensorCompType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)opTensorNanOpt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetOpTensorDescriptor) < 0 || rpc_write(0, &opTensorDesc, sizeof(const cudnnOpTensorDescriptor_t)) < 0 || @@ -20742,22 +34718,29 @@ cudnnStatus_t cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTenso rpc_read(0, opTensorNanOpt, sizeof(cudnnNanPropagation_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)opTensorOp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)opTensorCompType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)opTensorNanOpt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyOpTensorDescriptor) < 0 || rpc_write(0, &opTensorDesc, sizeof(cudnnOpTensorDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t* reduceTensorDesc) { + maybe_copy_unified_arg(0, (void*)reduceTensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateReduceTensorDescriptor) < 0 || rpc_write(0, reduceTensorDesc, sizeof(cudnnReduceTensorDescriptor_t)) < 0 || @@ -20765,11 +34748,18 @@ cudnnStatus_t cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t* r rpc_read(0, reduceTensorDesc, sizeof(cudnnReduceTensorDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)reduceTensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, cudnnNanPropagation_t reduceTensorNanOpt, cudnnReduceTensorIndices_t reduceTensorIndices, cudnnIndicesType_t reduceTensorIndicesType) { + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorOp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorCompType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorIndices, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorIndicesType, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetReduceTensorDescriptor) < 0 || rpc_write(0, 
&reduceTensorDesc, sizeof(cudnnReduceTensorDescriptor_t)) < 0 || @@ -20781,11 +34771,23 @@ cudnnStatus_t cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduc rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorOp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorCompType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorIndices, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorIndicesType, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, cudnnReduceTensorOp_t* reduceTensorOp, cudnnDataType_t* reduceTensorCompType, cudnnNanPropagation_t* reduceTensorNanOpt, cudnnReduceTensorIndices_t* reduceTensorIndices, cudnnIndicesType_t* reduceTensorIndicesType) { + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorOp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorCompType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorIndices, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorIndicesType, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetReduceTensorDescriptor) < 0 || rpc_write(0, &reduceTensorDesc, sizeof(const cudnnReduceTensorDescriptor_t)) < 0 || @@ -20802,22 +34804,35 @@ cudnnStatus_t cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t rpc_read(0, reduceTensorIndicesType, sizeof(cudnnIndicesType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorOp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorCompType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorIndices, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorIndicesType, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc) { + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyReduceTensorDescriptor) < 0 || rpc_write(0, &reduceTensorDesc, sizeof(cudnnReduceTensorDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetReductionIndicesSize(cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&aDesc, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&cDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetReductionIndicesSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20829,11 +34844,21 @@ cudnnStatus_t cudnnGetReductionIndicesSize(cudnnHandle_t handle, const cudnnRedu rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&aDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&aDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetReductionWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20845,11 +34870,17 @@ cudnnStatus_t cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, const cudnnRe rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&aDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t* filterDesc) { + maybe_copy_unified_arg(0, (void*)filterDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateFilterDescriptor) < 0 || rpc_write(0, filterDesc, sizeof(cudnnFilterDescriptor_t)) < 0 || @@ -20857,11 +34888,19 @@ cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t* filterDesc) rpc_read(0, filterDesc, sizeof(cudnnFilterDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)filterDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w) { + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if 
(rpc_start_request(0, RPC_cudnnSetFilter4dDescriptor) < 0 || rpc_write(0, &filterDesc, sizeof(cudnnFilterDescriptor_t)) < 0 || @@ -20874,11 +34913,25 @@ cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cud rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, cudnnDataType_t* dataType, cudnnTensorFormat_t* format, int* k, int* c, int* h, int* w) { + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)format, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetFilter4dDescriptor) < 0 || rpc_write(0, &filterDesc, sizeof(const cudnnFilterDescriptor_t)) < 0 || @@ -20897,11 +34950,20 @@ cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDes rpc_read(0, w, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)format, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t* size) { + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetFilterSizeInBytes) < 0 || rpc_write(0, &filterDesc, sizeof(const cudnnFilterDescriptor_t)) < 0 || @@ -20910,22 +34972,27 @@ cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc rpc_read(0, size, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyFilterDescriptor) < 0 || rpc_write(0, &filterDesc, sizeof(cudnnFilterDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t* poolingDesc) { + maybe_copy_unified_arg(0, (void*)poolingDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreatePoolingDescriptor) < 0 || rpc_write(0, poolingDesc, sizeof(cudnnPoolingDescriptor_t)) < 0 || @@ -20933,11 +35000,21 @@ cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t* poolingDesc rpc_read(0, poolingDesc, sizeof(cudnnPoolingDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)poolingDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) { + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxpoolingNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&windowHeight, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&windowWidth, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&verticalPadding, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&horizontalPadding, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&verticalStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&horizontalStride, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetPooling2dDescriptor) < 0 || rpc_write(0, &poolingDesc, sizeof(cudnnPoolingDescriptor_t)) < 0 || @@ -20952,11 +35029,29 @@ cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxpoolingNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&windowHeight, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&windowWidth, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&verticalPadding, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&horizontalPadding, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&verticalStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&horizontalStride, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t* mode, cudnnNanPropagation_t* maxpoolingNanOpt, int* windowHeight, int* windowWidth, int* verticalPadding, int* horizontalPadding, int* verticalStride, int* horizontalStride) { + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxpoolingNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)windowHeight, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)windowWidth, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)verticalPadding, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)horizontalPadding, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)verticalStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)horizontalStride, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetPooling2dDescriptor) < 0 || rpc_write(0, &poolingDesc, sizeof(const cudnnPoolingDescriptor_t)) < 0 || @@ -20979,11 +35074,26 @@ cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t pooling rpc_read(0, horizontalStride, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxpoolingNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)windowHeight, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)windowWidth, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)verticalPadding, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)horizontalPadding, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)verticalStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)horizontalStride, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int* n, int* c, int* h, int* w) { + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&inputTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetPooling2dForwardOutputDim) < 0 || rpc_write(0, &poolingDesc, sizeof(const cudnnPoolingDescriptor_t)) < 0 || @@ -20999,22 +35109,31 @@ cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t p rpc_read(0, w, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&inputTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyPoolingDescriptor) < 0 || rpc_write(0, &poolingDesc, sizeof(cudnnPoolingDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t* activationDesc) { + maybe_copy_unified_arg(0, (void*)activationDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateActivationDescriptor) < 0 || rpc_write(0, activationDesc, 
sizeof(cudnnActivationDescriptor_t)) < 0 || @@ -21022,11 +35141,16 @@ cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t* activ rpc_read(0, activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)activationDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, cudnnNanPropagation_t reluNanOpt, double coef) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reluNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&coef, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetActivationDescriptor) < 0 || rpc_write(0, &activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || @@ -21036,11 +35160,19 @@ cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activatio rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reluNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&coef, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t* mode, cudnnNanPropagation_t* reluNanOpt, double* coef) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reluNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)coef, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetActivationDescriptor) < 0 || rpc_write(0, &activationDesc, sizeof(const cudnnActivationDescriptor_t)) < 0 || @@ -21053,11 +35185,17 @@ cudnnStatus_t cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t act rpc_read(0, coef, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reluNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)coef, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&swish_beta, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetActivationDescriptorSwishBeta) < 0 || rpc_write(0, &activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || @@ -21065,11 +35203,15 @@ cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&swish_beta, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t 
cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double* swish_beta) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)swish_beta, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetActivationDescriptorSwishBeta) < 0 || rpc_write(0, &activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || @@ -21078,22 +35220,34 @@ cudnnStatus_t cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t rpc_read(0, swish_beta, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)swish_beta, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyActivationDescriptor) < 0 || rpc_write(0, &activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnActivationForward(cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, const void* alpha, const cudnnTensorDescriptor_t xDesc, const void* x, const void* beta, const cudnnTensorDescriptor_t yDesc, void* y) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnActivationForward) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21109,11 +35263,20 @@ cudnnStatus_t cudnnActivationForward(cudnnHandle_t handle, cudnnActivationDescri rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t* normDesc) { + maybe_copy_unified_arg(0, (void*)normDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateLRNDescriptor) < 0 || rpc_write(0, normDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || @@ -21121,11 +35284,17 @@ cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t* normDesc) rpc_read(0, normDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || rpc_end_response(0, 
&return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)normDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) { + maybe_copy_unified_arg(0, (void*)&normDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lrnN, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lrnAlpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lrnBeta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lrnK, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetLRNDescriptor) < 0 || rpc_write(0, &normDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || @@ -21136,11 +35305,21 @@ cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&normDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lrnN, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lrnAlpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lrnBeta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lrnK, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned* lrnN, double* lrnAlpha, double* lrnBeta, double* lrnK) { + maybe_copy_unified_arg(0, (void*)&normDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)lrnN, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)lrnAlpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)lrnBeta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)lrnK, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetLRNDescriptor) < 0 || rpc_write(0, &normDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || @@ -21155,22 +35334,32 @@ cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned* lrn rpc_read(0, lrnK, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&normDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)lrnN, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)lrnAlpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)lrnBeta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)lrnK, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { + maybe_copy_unified_arg(0, (void*)&lrnDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyLRNDescriptor) < 0 || rpc_write(0, &lrnDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&lrnDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, cudnnBatchNormMode_t mode) { + maybe_copy_unified_arg(0, (void*)&derivedBnDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDeriveBNTensorDescriptor) < 0 || rpc_write(0, &derivedBnDesc, 
sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -21179,11 +35368,19 @@ cudnnStatus_t cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDes rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&derivedBnDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc, cudnnTensorDescriptor_t derivedNormMeanVarDesc, const cudnnTensorDescriptor_t xDesc, cudnnNormMode_t mode, int groupCnt) { + maybe_copy_unified_arg(0, (void*)&derivedNormScaleBiasDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&derivedNormMeanVarDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDeriveNormTensorDescriptor) < 0 || rpc_write(0, &derivedNormScaleBiasDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -21194,11 +35391,17 @@ cudnnStatus_t cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNor rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&derivedNormScaleBiasDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&derivedNormMeanVarDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t* stDesc) { + maybe_copy_unified_arg(0, (void*)stDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateSpatialTransformerDescriptor) < 0 || rpc_write(0, stDesc, sizeof(cudnnSpatialTransformerDescriptor_t)) < 0 || @@ -21206,22 +35409,26 @@ cudnnStatus_t cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDes rpc_read(0, stDesc, sizeof(cudnnSpatialTransformerDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)stDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc) { + maybe_copy_unified_arg(0, (void*)&stDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroySpatialTransformerDescriptor) < 0 || rpc_write(0, &stDesc, sizeof(cudnnSpatialTransformerDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&stDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t* dropoutDesc) { + maybe_copy_unified_arg(0, (void*)dropoutDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateDropoutDescriptor) < 0 || rpc_write(0, dropoutDesc, sizeof(cudnnDropoutDescriptor_t)) < 0 || @@ -21229,22 +35436,27 @@ cudnnStatus_t cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t* dropoutDesc 
rpc_read(0, dropoutDesc, sizeof(cudnnDropoutDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)dropoutDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { + maybe_copy_unified_arg(0, (void*)&dropoutDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyDropoutDescriptor) < 0 || rpc_write(0, &dropoutDesc, sizeof(cudnnDropoutDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&dropoutDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDropoutGetStatesSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21253,11 +35465,15 @@ cudnnStatus_t cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t* sizeInByte rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&xdesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDropoutGetReserveSpaceSize) < 0 || rpc_write(0, &xdesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -21266,11 +35482,18 @@ cudnnStatus_t cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, siz rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&xdesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float* dropout, void** states, unsigned long long* seed) { + maybe_copy_unified_arg(0, (void*)&dropoutDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dropout, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)states, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)seed, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetDropoutDescriptor) < 0 || rpc_write(0, &dropoutDesc, sizeof(cudnnDropoutDescriptor_t)) < 0 || @@ -21284,6 +35507,11 @@ cudnnStatus_t cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, cu rpc_read(0, seed, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&dropoutDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dropout, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)states, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)seed, 
cudaMemcpyDeviceToHost); return return_value; } @@ -21299,6 +35527,15 @@ cudnnStatus_t cudnnOpsVersionCheck() cudnnStatus_t cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc, const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const cudnnActivationDescriptor_t activationDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&zDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bnScaleBiasMeanVarDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21314,11 +35551,31 @@ cudnnStatus_t cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHand rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&zDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bnScaleBiasMeanVarDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc, const cudnnTensorDescriptor_t dxDesc, const cudnnTensorDescriptor_t dBnScaleBiasDesc, const cudnnActivationDescriptor_t activationDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dyDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dzDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dxDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dBnScaleBiasDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize) < 0 || 
rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21336,11 +35593,28 @@ cudnnStatus_t cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t ha rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dyDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dzDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dxDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dBnScaleBiasDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, const cudnnActivationDescriptor_t activationDesc, const cudnnTensorDescriptor_t xDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21353,11 +35627,29 @@ cudnnStatus_t cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle, cudnnNormMode_t mode, cudnnNormOps_t normOps, cudnnNormAlgo_t algo, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc, const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t normScaleBiasDesc, const cudnnActivationDescriptor_t activationDesc, const cudnnTensorDescriptor_t normMeanVarDesc, size_t* sizeInBytes, int groupCnt) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&zDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&normScaleBiasDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normMeanVarDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21376,11 +35668,37 @@ cudnnStatus_t cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t ha rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&zDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normScaleBiasDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normMeanVarDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, cudnnNormMode_t mode, cudnnNormOps_t normOps, cudnnNormAlgo_t algo, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc, const cudnnTensorDescriptor_t dxDesc, const cudnnTensorDescriptor_t dNormScaleBiasDesc, const cudnnActivationDescriptor_t activationDesc, const cudnnTensorDescriptor_t normMeanVarDesc, size_t* sizeInBytes, int groupCnt) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dyDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dzDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dxDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dNormScaleBiasDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normMeanVarDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetNormalizationBackwardWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21401,11 +35719,33 @@ cudnnStatus_t cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, c rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dyDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dzDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dxDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dNormScaleBiasDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normMeanVarDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle, cudnnNormMode_t mode, cudnnNormOps_t normOps, cudnnNormAlgo_t algo, const cudnnActivationDescriptor_t activationDesc, const cudnnTensorDescriptor_t xDesc, size_t* sizeInBytes, int groupCnt) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetNormalizationTrainingReserveSpaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21420,6 +35760,14 @@ cudnnStatus_t cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyDeviceToHost); return return_value; } diff --git a/local.sh b/local.sh index 3df7c12..7758b54 100755 --- a/local.sh +++ b/local.sh @@ -234,6 +234,7 @@ build_tests() { nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_unified.cu -o cublas_unified.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cudnn_managed.cu -o cudnn_managed.o } run() { diff --git a/test/cudnn_managed.cu b/test/cudnn_managed.cu new file mode 100644 index 0000000..5cf6980 --- /dev/null +++ 
b/test/cudnn_managed.cu
@@ -0,0 +1,83 @@
+#include <cudnn.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+/**
+ * Minimal example to apply sigmoid activation on a tensor
+ * using cuDNN with Unified Memory.
+ **/
+int main(int argc, char** argv)
+{
+    int numGPUs;
+    cudaGetDeviceCount(&numGPUs);
+    std::cout << "Found " << numGPUs << " GPUs." << std::endl;
+    cudaSetDevice(0); // use GPU0
+    int device;
+    struct cudaDeviceProp devProp;
+    cudaGetDevice(&device);
+    cudaGetDeviceProperties(&devProp, device);
+    std::cout << "Compute capability:" << devProp.major << "." << devProp.minor << std::endl;
+
+    cudnnHandle_t handle_;
+    cudnnCreate(&handle_);
+    std::cout << "Created cuDNN handle" << std::endl;
+
+    // Create the tensor descriptor
+    cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
+    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
+
+    int n = 1, c = 1, h = 1, w = 10;
+    int NUM_ELEMENTS = n * c * h * w;
+    cudnnTensorDescriptor_t x_desc;
+
+    cudnnCreateTensorDescriptor(&x_desc);
+    cudnnSetTensor4dDescriptor(x_desc, format, dtype, n, c, h, w);
+
+    // Allocate unified memory for the tensor
+    float *x;
+    cudaMallocManaged(&x, NUM_ELEMENTS * sizeof(float));
+
+    // Initialize the tensor
+    for (int i = 0; i < NUM_ELEMENTS; i++) x[i] = i * 1.00f;
+
+    std::cout << "Original array: ";
+    for (int i = 0; i < NUM_ELEMENTS; i++) std::cout << x[i] << " ";
+    std::cout << std::endl;
+
+    // Synchronize to ensure data is accessible on the device
+    cudaDeviceSynchronize();
+
+    // Create activation function descriptor
+    float alpha[1] = {1};
+    float beta[1] = {0.0};
+    cudnnActivationDescriptor_t sigmoid_activation;
+    cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
+    cudnnNanPropagation_t prop = CUDNN_NOT_PROPAGATE_NAN;
+    cudnnCreateActivationDescriptor(&sigmoid_activation);
+    cudnnSetActivationDescriptor(sigmoid_activation, mode, prop, 0.0f);
+
+    cudnnActivationForward(
+        handle_,
+        sigmoid_activation,
+        alpha,
+        x_desc,
+        x,
+        beta,
+        x_desc,
+        x
+    );
+
+    // Synchronize to ensure data is updated on the host
+    cudaDeviceSynchronize();
+
+    cudnnDestroy(handle_);
+    std::cout << "Destroyed cuDNN handle." << std::endl;
+
+    std::cout << "New array: ";
+    for (int i = 0; i < NUM_ELEMENTS; i++) std::cout << x[i] << " ";
+    std::cout << std::endl;
+
+    cudaFree(x);
+
+    return 0;
+}
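
For reference (not part of this diff): a minimal sketch of exercising the same unified-memory path through another generated wrapper, cudnnDropoutGetStatesSize, whose sizeInBytes output pointer is bracketed by maybe_copy_unified_arg calls in gen_client.cpp above. It assumes the same cudnn.h / cuda_runtime.h headers as test/cudnn_managed.cu and is only an illustration of the pattern, not an implementation shipped with this change.

#include <cudnn.h>
#include <cuda_runtime.h>
#include <iostream>

int main()
{
    cudnnHandle_t handle;
    cudnnCreate(&handle);

    // sizeInBytes lives in managed memory, so the client-side
    // maybe_copy_unified_arg() calls keep it coherent across the RPC.
    size_t* sizeInBytes;
    cudaMallocManaged(&sizeInBytes, sizeof(size_t));

    cudnnStatus_t status = cudnnDropoutGetStatesSize(handle, sizeInBytes);
    cudaDeviceSynchronize();

    if (status == CUDNN_STATUS_SUCCESS)
        std::cout << "dropout states size: " << *sizeInBytes << " bytes" << std::endl;

    cudaFree(sizeInBytes);
    cudnnDestroy(handle);
    return 0;
}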