diff --git a/include/imex/Transforms/Passes.h b/include/imex/Transforms/Passes.h
index a697e1b49..e82ac8f90 100644
--- a/include/imex/Transforms/Passes.h
+++ b/include/imex/Transforms/Passes.h
@@ -19,12 +19,15 @@
 #include <memory>

 namespace imex {
+struct InsertGPUAllocsOptions;

 //===----------------------------------------------------------------------===//
 // Passes
 //===----------------------------------------------------------------------===//
 std::unique_ptr<mlir::Pass> createSerializeSPIRVPass();
 std::unique_ptr<mlir::Pass>
 createInsertGPUAllocsPass(const char *clientAPI = "vulkan");
+std::unique_ptr<mlir::Pass>
+createInsertGPUAllocsPass(const InsertGPUAllocsOptions &);
 std::unique_ptr<mlir::Pass> createSetSPIRVCapabilitiesPass();
 std::unique_ptr<mlir::Pass>
 createSetSPIRVAbiAttributePass(const char *clientAPI = "vulkan");
diff --git a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td
index 5717c996d..4de20b13b 100644
--- a/include/imex/Transforms/Passes.td
+++ b/include/imex/Transforms/Passes.td
@@ -41,7 +41,11 @@ def InsertGPUAllocs : Pass<"insert-gpu-allocs", "::mlir::func::FuncOp"> {
     Option<"clientAPI", "client-api", "std::string", /*default=*/"\"opencl\"",
            "The client API to use for inserting gpu allocs">,
     Option<"inRegions", "in-regions", "bool", "false",
-           "Add gpu allocs only for memref.AllocOps within GPU regions">
+           "Add gpu allocs only for memref.AllocOps within GPU regions">,
+    Option<"isUsmArgs", "is-usm-args", "bool", "false",
+           "Whether function arguments are USM (unified shared memory) "
+           "pointers, in which case host and device can access the same "
+           "buffer and no explicit memcpy needs to be inserted">
   ];
 }
diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp
index d9e350e2c..178a8ac72 100644
--- a/lib/Transforms/InsertGPUAllocs.cpp
+++ b/lib/Transforms/InsertGPUAllocs.cpp
@@ -47,6 +47,12 @@ class InsertGPUAllocsPass final
   explicit InsertGPUAllocsPass() : m_clientAPI("vulkan") {}
   explicit InsertGPUAllocsPass(const mlir::StringRef &clientAPI)
       : m_clientAPI(clientAPI) {}
+  explicit InsertGPUAllocsPass(const imex::InsertGPUAllocsOptions &options)
+      : InsertGPUAllocsBase(options) {
+    if (clientAPI == "opencl") {
+      m_clientAPI = "opencl";
+    }
+  }

   mlir::LogicalResult
   initializeOptions(mlir::StringRef options,
@@ -540,15 +546,17 @@ class InsertGPUAllocsPass final
     // This is the case where the inputs are passed as arguments to the
     // function. This code will add the IR for memory allocation on the device
     // with gpu.alloc and insert a memref.copy from host to device
-    for (const auto &it : gpuBufferParams) {
-      auto param = block.getArgument(it.first);
-      if (isGpuAddrSpace(param))
-        continue;
-      auto access = getAccessType(param);
-      access.hostRead = true;
-      access.hostWrite = true;
-      builder.setInsertionPointToStart(&block);
-      add_gpu_alloc(builder, param, access, term);
+    if (!isUsmArgs.getValue()) {
+      for (const auto &it : gpuBufferParams) {
+        auto param = block.getArgument(it.first);
+        if (isGpuAddrSpace(param))
+          continue;
+        auto access = getAccessType(param);
+        access.hostRead = true;
+        access.hostWrite = true;
+        builder.setInsertionPointToStart(&block);
+        add_gpu_alloc(builder, param, access, term);
+      }
     }

     // CallOp Case: This is the case where the memref producer is coming
@@ -580,4 +588,8 @@ namespace imex {
 std::unique_ptr<mlir::Pass> createInsertGPUAllocsPass(const char *clientAPI) {
   return std::make_unique<InsertGPUAllocsPass>(clientAPI);
 }
+std::unique_ptr<mlir::Pass>
+createInsertGPUAllocsPass(const InsertGPUAllocsOptions &option) {
+  return std::make_unique<InsertGPUAllocsPass>(option);
+}
 } // namespace imex
diff --git a/test/Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir b/test/Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir
index 2a81c56aa..765b3f0d8 100644
--- a/test/Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir
+++ b/test/Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir
@@ -65,3 +65,43 @@ module @eltwise_add attributes {gpu.container_module} {
   }
   func.func private @printMemrefBF16(memref<*xbf16>)
 }
+
+
+module @eltwise_add_usm attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_10x20xbf16 : memref<10x20xbf16> = dense<5.000000e-01>
+  func.func @test(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>) -> memref<10x20xbf16> {
+    %c20 = arith.constant 20 : index
+    %c10 = arith.constant 10 : index
+    %c1 = arith.constant 1 : index
+    %memref_1 = gpu.alloc host_shared () : memref<10x20xbf16>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c10, %c20, %c1) threads in (%c1, %c1, %c1) args(%arg0 : memref<10x20xbf16>, %arg1 : memref<10x20xbf16>, %memref_1 : memref<10x20xbf16>)
+    %alloc = memref.alloc() : memref<10x20xbf16>
+    memref.copy %memref_1, %alloc : memref<10x20xbf16> to memref<10x20xbf16>
+    gpu.dealloc %memref_1 : memref<10x20xbf16>
+    return %alloc : memref<10x20xbf16>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>, %arg2: memref<10x20xbf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 10, 20, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %block_id_x = gpu.block_id x
+      %block_id_y = gpu.block_id y
+      %cst = arith.constant 0.5 : bf16
+      %0 = memref.load %arg0[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %1 = memref.load %arg1[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %2 = arith.addf %0, %1 : bf16
+      %3 = arith.addf %2, %cst : bf16
+      memref.store %3, %arg2[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      gpu.return
+    }
+  }
+  func.func @main() {
+    %0 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %1 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %2 = call @test(%0, %1) : (memref<10x20xbf16>, memref<10x20xbf16>) -> memref<10x20xbf16>
+    %cast = memref.cast %2 : memref<10x20xbf16> to memref<*xbf16>
+    // CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}}
+    // CHECK-COUNT-200: 1.5
+    call @printMemrefBF16(%cast) : (memref<*xbf16>) -> ()
+    return
+  }
+  func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Transforms/InsertGpuAllocs/skip-gpu-alloc-for-usm-args.mlir b/test/Transforms/InsertGpuAllocs/skip-gpu-alloc-for-usm-args.mlir
new file mode 100644
index 000000000..cd64312ed
--- /dev/null
+++ b/test/Transforms/InsertGpuAllocs/skip-gpu-alloc-for-usm-args.mlir
@@ -0,0 +1,54 @@
+// RUN: imex-opt --insert-gpu-allocs='client-api=opencl is-usm-args=1' %s | FileCheck %s --check-prefix=OPENCL
+// RUN: imex-opt --insert-gpu-allocs='client-api=vulkan is-usm-args=1' %s | FileCheck %s --check-prefix=VULKAN
+
+// OPENCL-LABEL: func.func @addt
+// OPENCL-SAME: %[[arg0:.+]]: memref<2x5xf32>, %[[arg1:.+]]: memref<2x5xf32>, %[[out_buff:.+]]: memref<2x5xf32>
+// VULKAN-LABEL: func.func @addt
+// VULKAN-SAME: %[[arg0:.+]]: memref<2x5xf32>, %[[arg1:.+]]: memref<2x5xf32>, %[[out_buff:.+]]: memref<2x5xf32>
+func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memref<2x5xf32>) -> memref<2x5xf32> {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  // OPENCL-NOT: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
+  // OPENCL-NOT: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
+  // OPENCL-NOT: memref.copy
+  // OPENCL-NOT: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
+  // OPENCL-NOT: memref.copy
+
+  // VULKAN-NOT: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
+  // VULKAN-NOT: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
+  // VULKAN-NOT: memref.copy
+  // VULKAN-NOT: %[[MEMREF2:.*]] = memref.alloc() : memref<2x5xf32>
+  // VULKAN-NOT: memref.copy
+
+  %tmp_buff = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
+  // OPENCL-NOT: memref.alloc()
+  // OPENCL: %[[MEMREF3:.*]] = gpu.alloc () : memref<2x5xf32>
+  // VULKAN: %[[MEMREF3:.*]] = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
+
+  %c1_0 = arith.constant 1 : index
+  %1 = affine.apply affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>(%c2)[%c0, %c1]
+  %2 = affine.apply affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>(%c5)[%c0, %c1]
+  gpu.launch blocks(%arg2, %arg3, %arg4) in (%arg8 = %1, %arg9 = %2, %arg10 = %c1_0) threads(%arg5, %arg6, %arg7) in (%arg11 = %c1_0, %arg12 = %c1_0, %arg13 = %c1_0) {
+    %3 = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%arg2)[%c1, %c0]
+    %4 = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%arg3)[%c1, %c0]
+    %5 = memref.load %arg0[%3, %4] : memref<2x5xf32>
+    %6 = memref.load %arg1[%3, %4] : memref<2x5xf32>
+    %7 = arith.addf %5, %6 : f32
+    memref.store %7, %tmp_buff[%3, %4] : memref<2x5xf32>
+
+    %8 = memref.load %tmp_buff[%3, %4] : memref<2x5xf32>
+    %9 = arith.addf %8, %5 : f32
+    memref.store %9, %out_buff[%3, %4] : memref<2x5xf32>
+
+    gpu.terminator
+  } {SCFToGPU_visited}
+
+  // OPENCL-NOT: memref.dealloc %[[MEMREF3]] : memref<2x5xf32>
+  // OPENCL: gpu.dealloc %[[MEMREF3]] : memref<2x5xf32>
+  // VULKAN: memref.dealloc %[[MEMREF3]] : memref<2x5xf32>
+  memref.dealloc %tmp_buff : memref<2x5xf32>
+
+  return %out_buff : memref<2x5xf32>
+}
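---

Reviewer note, not part of the patch: below is a minimal C++ sketch of how a
downstream pipeline could drive the new options-based overload. It assumes the
tablegen-generated InsertGPUAllocsOptions struct exposes fields named after the
C++ option names (clientAPI, inRegions, isUsmArgs), per the usual MLIR
pass-options convention; the buildPipeline function itself is illustrative.

    #include "imex/Transforms/Passes.h"
    #include "mlir/Dialect/Func/IR/FuncOps.h"
    #include "mlir/Pass/PassManager.h"

    // Hypothetical pipeline setup: skips the gpu.alloc + memref.copy staging
    // for function arguments on the assumption that they are USM pointers.
    void buildPipeline(mlir::PassManager &pm) {
      imex::InsertGPUAllocsOptions options; // assumed tablegen-generated struct
      options.clientAPI = "opencl";         // equivalent to client-api=opencl
      options.isUsmArgs = true;             // equivalent to is-usm-args=1
      // InsertGPUAllocs is declared on ::mlir::func::FuncOp, so nest it.
      pm.addNestedPass<mlir::func::FuncOp>(
          imex::createInsertGPUAllocsPass(options));
    }

The same behavior is exercised from the command line by the RUN lines in the
new test, e.g. imex-opt --insert-gpu-allocs='client-api=opencl is-usm-args=1'.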