Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GC-STAGING][InsertGPUAllocs] memref.copy -> gpu.memcpy #980

Open
wants to merge 1 commit into
base: gc-staging
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 68 additions & 17 deletions lib/Transforms/InsertGPUAllocs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,32 @@ class InsertGPUAllocsPass final
return mlir::success();
}

// Walk all users of 'memory' (transitively, through the results of
// intermediate ops such as casts/views) and replace every memref.copy
// that touches it with an equivalent gpu.memcpy, so the copy is issued
// through the GPU runtime instead of as a host-side copy.
//
// NOTE(review): this does not check whether the user op lives inside a
// gpu module/launch region — confirm that is intended for all callers.
static void replaceMemrefCopyWithGpuMemcpy(mlir::OpBuilder &builder,
                                           mlir::Value memory) {
  // First collect the copies, then rewrite them. Collecting up front is
  // required for two reasons:
  //  1. Creating the replacement gpu.memcpy adds new uses of the very
  //     values being walked; mutating a use-list while iterating
  //     getUsers() over it is unsafe.
  //  2. getUsers() yields an op once per use, so a copy whose source and
  //     target are both 'memory' would otherwise be recorded (and later
  //     erased) twice.
  mlir::SmallVector<mlir::memref::CopyOp> copies;
  mlir::SmallVector<mlir::Value> worklist;
  worklist.push_back(memory);
  while (!worklist.empty()) {
    mlir::Value value = worklist.pop_back_val();
    for (mlir::Operation *user : value.getUsers()) {
      if (auto copyOp = mlir::dyn_cast<mlir::memref::CopyOp>(user)) {
        // Deduplicate: the same copy can be reached through several uses.
        if (!llvm::is_contained(copies, copyOp))
          copies.push_back(copyOp);
      } else {
        // Follow derived values (e.g. memref.cast results) to find copies
        // performed through them as well.
        worklist.append(user->result_begin(), user->result_end());
      }
    }
  }

  for (mlir::memref::CopyOp copyOp : copies) {
    builder.setInsertionPoint(copyOp);
    builder.create<mlir::gpu::MemcpyOp>(
        copyOp.getLoc(), /*resultTypes=*/mlir::TypeRange(),
        /*asyncDeps=*/mlir::ValueRange(), /*dst=*/copyOp.getTarget(),
        /*src=*/copyOp.getSource());
    copyOp.erase();
  }
}

void runOnOperation() override {
auto func = getOperation();
auto &funcBody = func.getBody();
Expand Down Expand Up @@ -382,12 +408,14 @@ class InsertGPUAllocsPass final
}

if (auto copy = mlir::dyn_cast<mlir::memref::CopyOp>(user)) {
if (copy.getSource() == mem)
ret.hostRead = true;

if (copy.getTarget() == mem)
ret.hostWrite = true;

// All memref.copy ops should be replaced with gpu.memcpy by
// 'add_gpu_alloc' function, so we shouldn't count them as "host
// usage". We may need to uncomment this code and do some analysis
// here instead if a case where it doesn't work would be found.
// if (copy.getSource() == mem)
//   ret.hostRead = true;
// if (copy.getTarget() == mem)
//   ret.hostWrite = true;
continue;
}

Expand Down Expand Up @@ -433,23 +461,29 @@ class InsertGPUAllocsPass final
/*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(),
alloc.getSymbolOperands(), hostShared);
auto allocResult = gpuAlloc.getResult(0);

auto memory = alloc->getResult(0);
replaceMemrefCopyWithGpuMemcpy(builder, memory);

builder.setInsertionPoint(term);
for (mlir::OpOperand &use : alloc.getResult().getUses()) {
if (use.getOwner() == term) {
auto newAlloc = builder.create<mlir::memref::AllocOp>(
loc, alloc.getType(), alloc.getDynamicSizes(),
alloc.getSymbolOperands());
builder.create<mlir::memref::CopyOp>(loc, allocResult,
newAlloc.getResult());
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/newAlloc.getResult(),
/*src=*/allocResult);
use.set(newAlloc.getResult());
}
}

// remove 'memref.dealloc' (it's later replaced with gpu.dealloc)
auto memory = alloc->getResult(0);
for (auto u : memory.getUsers()) {
if (auto dealloc = mlir::dyn_cast<mlir::memref::DeallocOp>(u)) {
dealloc.erase();
break;
}
}

Expand All @@ -469,6 +503,8 @@ class InsertGPUAllocsPass final
filter.clear();
dims.clear();

replaceMemrefCopyWithGpuMemcpy(builder, op);

// This code handles dynamic dims with known rank.
for (auto i : llvm::seq(0u, rank)) {
if (memrefType.isDynamicDim(i)) {
Expand All @@ -489,8 +525,9 @@ class InsertGPUAllocsPass final
/*symbolOperands*/ std::nullopt, hostShared);
auto allocResult = gpuAlloc.getResult(0);
if (access.hostWrite && access.deviceRead) {
auto copy =
builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
auto copy = builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), allocResult, op);
filter.insert(copy);
}

Expand All @@ -501,15 +538,21 @@ class InsertGPUAllocsPass final
op.replaceAllUsesExcept(castedAllocResult, filter);
builder.setInsertionPoint(term);
if (access.hostRead && access.deviceWrite) {
builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
/*src=*/castedAllocResult);
}
builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt,
castedAllocResult);
} else {
op.replaceAllUsesExcept(allocResult, filter);
builder.setInsertionPoint(term);
if (access.hostRead && access.deviceWrite) {
builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
/*src=*/allocResult);
}
builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt, allocResult);
}
Expand All @@ -518,8 +561,10 @@ class InsertGPUAllocsPass final
builder.create<mlir::memref::AllocOp>(loc, allocType, dims);
auto allocResult = gpuAlloc.getResult();
if (access.hostWrite && access.deviceRead) {
auto copy =
builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
auto copy = builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/allocResult,
/*src=*/op);
filter.insert(copy);
}

Expand All @@ -530,13 +575,19 @@ class InsertGPUAllocsPass final
op.replaceAllUsesExcept(castedAllocResult, filter);
builder.setInsertionPoint(term);
if (access.hostRead && access.deviceWrite) {
builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
/*src=*/castedAllocResult);
}
} else {
op.replaceAllUsesExcept(allocResult, filter);
builder.setInsertionPoint(term);
if (access.hostRead && access.deviceWrite) {
builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
/*src=*/allocResult);
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memre
%c5 = arith.constant 5 : index
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %[[arg1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF1]], %[[arg1]] : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %[[arg0]], %[[MEMREF2]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF2]], %[[arg0]] : memref<2x5xf32>, memref<2x5xf32>

// VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %[[arg1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF1]], %[[arg1]] : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: %[[MEMREF2:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %[[arg0]], %[[MEMREF2]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF2]], %[[arg0]] : memref<2x5xf32>, memref<2x5xf32>

%tmp_buff = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
// OPENCL-NOT: %[[MEMREF3:.*]] = memref.alloc().*
Expand Down Expand Up @@ -49,7 +49,7 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memre
// OPENCL: gpu.dealloc %[[MEMREF3]] : memref<2x5xf32>
// OPENCL: gpu.dealloc %[[MEMREF2]] : memref<2x5xf32>
// OPENCL: gpu.dealloc %[[MEMREF1]] : memref<2x5xf32>
// OPENCL: memref.copy %[[MEMREF0]], %[[out_buff]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[out_buff]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: gpu.dealloc %[[MEMREF0]] : memref<2x5xf32>
// VULKAN: memref.dealloc %[[MEMREF3]] : memref<2x5xf32>
memref.dealloc %tmp_buff : memref<2x5xf32>
Expand Down
8 changes: 4 additions & 4 deletions test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %gpu_arg0: memre
%c1 = arith.constant 1 : index
%c5 = arith.constant 5 : index
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>

%0 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
// OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
Expand Down
12 changes: 6 additions & 6 deletions test/Transforms/InsertGpuAllocs/memref-get-global.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
%2 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>

// OPENCL: [[VAR0:%.*]] = memref.get_global @__constant_2x5xf32 : memref<2x5xf32>
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy [[VAR0]], %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc () : memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF0]], [[VAR0]] : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: [[VAR1:%.*]] = memref.get_global @__constant_2x5xf32_0 : memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy [[VAR1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc () : memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF1]], [[VAR1]] : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// VULKAN: [[VAR0:%.*]] = memref.get_global @__constant_2x5xf32 : memref<2x5xf32>
// VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy [[VAR0]], %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF0]], [[VAR0]] : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: [[VAR1:%.*]] = memref.get_global @__constant_2x5xf32_0 : memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy [[VAR1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF1]], [[VAR1]] : memref<2x5xf32>, memref<2x5xf32>

%c1_0 = arith.constant 1 : index
%3 = affine.apply affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>(%c2)[%c0, %c1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ func.func @main() {
// OPENCL: func.func @main()
%0 = func.call @alloc_buffer() : () -> memref<8xf32>
// OPENCL: %[[MEMREF:.*]] = gpu.alloc host_shared () : memref<8xf32>
// OPENCL: memref.copy %0, %[[MEMREF]] : memref<8xf32> to memref<8xf32>
// OPENCL: gpu.memcpy %[[MEMREF]], %0 : memref<8xf32>, memref<8xf32>
%1 = memref.alloc() : memref<8xf32>
%2 = memref.alloc() : memref<8xf32>
gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c8, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
Expand Down
8 changes: 4 additions & 4 deletions test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
%c1 = arith.constant 1 : index
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %[[arg0]], %[[MEMREF1:.*]]
// OPENCL: gpu.memcpy %[[MEMREF1:.*]], %[[arg0]]
// VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %[[arg0]], %[[MEMREF1:.*]]
// VULKAN: gpu.memcpy %[[MEMREF1:.*]], %[[arg0]]

gpu.launch blocks(%arg2, %arg3, %arg4) in (%arg8 = %c1, %arg9 = %c1, %arg10 = %c1) threads(%arg5, %arg6, %arg7) in (%arg11 = %c1, %arg12 = %c1, %arg13 = %c1) {
%c0 = arith.constant 0 : index
Expand All @@ -23,8 +23,8 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
gpu.terminator
} {SCFToGPU_visited}

// OPENCL: memref.copy %[[MEMREF0]], %[[arg1]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: memref.copy %[[MEMREF0]], %[[arg1]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[arg1]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: gpu.memcpy %[[arg1]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>

return %arg1 : memref<2x5xf32>
}
Loading