diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp
index 84f7c9254..97145241f 100644
--- a/lib/Transforms/InsertGPUAllocs.cpp
+++ b/lib/Transforms/InsertGPUAllocs.cpp
@@ -406,8 +406,10 @@ class InsertGPUAllocsPass final
             auto newAlloc = builder.create<mlir::memref::AllocOp>(
                 loc, alloc.getType(), alloc.getDynamicSizes(),
                 alloc.getSymbolOperands());
-            builder.create<mlir::memref::CopyOp>(loc, allocResult,
-                                                 newAlloc.getResult());
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*asyncToken*/ static_cast<mlir::Type>(nullptr),
+                /*asyncDependencies*/ std::nullopt, newAlloc.getResult(),
+                allocResult);
             use.set(newAlloc.getResult());
           }
         }
@@ -456,8 +458,9 @@ class InsertGPUAllocsPass final
             /*symbolOperands*/ std::nullopt, hostShared);
         auto allocResult = gpuAlloc.getResult(0);
         if (access.hostWrite && access.deviceRead) {
-          auto copy =
-              builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
+          auto copy = builder.create<mlir::gpu::MemcpyOp>(
+              loc, /*asyncToken*/ static_cast<mlir::Type>(nullptr),
+              /*asyncDependencies*/ std::nullopt, allocResult, op);
           filter.insert(copy);
         }
 
@@ -476,7 +479,9 @@ class InsertGPUAllocsPass final
           op.replaceAllUsesExcept(allocResult, filter);
           builder.setInsertionPoint(term);
           if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*asyncToken*/ static_cast<mlir::Type>(nullptr),
+                /*asyncDependencies*/ std::nullopt, op, allocResult);
           }
           builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt, allocResult);
         }
diff --git a/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir b/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
index f7beea259..bc9b661bd 100644
--- a/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
+++ b/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
@@ -7,9 +7,9 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
   %c1 = arith.constant 1 : index
   %c5 = arith.constant 5 : index
   // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
   // VULKAN: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>