Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
silee2 committed Jun 11, 2024
2 parents b7e7151 + 5c0ef3f commit 5fd3105
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 12 deletions.
6 changes: 5 additions & 1 deletion test/Conversion/XeGPUToVC/gemm-scf.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ module @gemm attributes {gpu.container_module} {
// LSC: %[[A_OFFSETX:.*]] = vector.extract %[[A_PAYLOAD]][5] : i32 from vector<8xi32>
// LSC: %[[A_OFFSETY:.*]] = vector.extract %[[A_PAYLOAD]][6] : i32 from vector<8xi32>
// LSC: %[[LOAD2D_A_v64i32:.*]] = func.call @llvm.genx.lsc.load2d.stateless.v64i32.i1.i64({{.*}}, %[[BASE_A]], {{.*}}, %[[A_OFFSETX]], %[[A_OFFSETY]]) : (i1, i8, i8, i8, i8, i8, i32, i32, i8, i64, i32, i32, i32, i32, i32) -> vector<64xi32>
// CHECK: %[[LOAD2D_A_v64i32_CAST:.*]] = vector.bitcast %[[LOAD2D_A_v64i32]] : vector<64xi32> to vector<128xf16>
%9 = xegpu.load_nd %7 {vnni_axis = 1}: !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>

// RAW: %[[LOAD2D_B_v128i32:.*]] = func.call @llvm.genx.raw.send2.v128i32.i1.v8i32({{.*}}, %[[B_PAYLOAD]], %{{.*}}) : (i8, i8, i1, i8, i8, i8, i32, i32, vector<8xi32>, vector<128xi32>) -> vector<128xi32>
Expand All @@ -82,9 +83,12 @@ module @gemm attributes {gpu.container_module} {
// LSC: %[[B_OFFSETX:.*]] = vector.extract %[[B_PAYLOAD]][5] : i32 from vector<8xi32>
// LSC: %[[B_OFFSETY:.*]] = vector.extract %[[B_PAYLOAD]][6] : i32 from vector<8xi32>
// LSC: %[[LOAD2D_B_v128i32:.*]] = func.call @llvm.genx.lsc.load2d.stateless.v128i32.i1.i64({{.*}}, %[[BASE_B]], {{.*}}, %[[B_OFFSETX]], %[[B_OFFSETY]]) : {{.*}} -> vector<128xi32>
// CHECK: %[[LOAD2D_B_v128i32_CAST:.*]] = vector.bitcast %[[LOAD2D_B_v128i32]] : vector<128xi32> to vector<256xf16>
%10 = xegpu.load_nd %8 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>

// CHECK: %[[DPAS_RES:.*]] = func.call @llvm.genx.dpas2.v128f32.v128i32.v64i32(%[[arg4]], %[[LOAD2D_B_v128i32]], %[[LOAD2D_A_v64i32]], {{.*}}) : (vector<128xf32>, vector<128xi32>, vector<64xi32>, i32, i32, i32, i32, i32, i32) -> vector<128xf32>
// CHECK: %[[LOAD2D_A_v64i32_RECAST:.*]] = vector.bitcast %[[LOAD2D_A_v64i32_CAST]] : vector<128xf16> to vector<64xi32>
// CHECK: %[[LOAD2D_B_v128i32_RECAST:.*]] = vector.bitcast %[[LOAD2D_B_v128i32_CAST]] : vector<256xf16> to vector<128xi32>
// CHECK: %[[DPAS_RES:.*]] = func.call @llvm.genx.dpas2.v128f32.v128i32.v64i32(%[[arg4]], %[[LOAD2D_B_v128i32_RECAST]], %[[LOAD2D_A_v64i32_RECAST]], {{.*}}) : (vector<128xf32>, vector<128xi32>, vector<64xi32>, i32, i32, i32, i32, i32, i32) -> vector<128xf32>
%11 = xegpu.dpas %9, %10, %arg4 : vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: scf.yield %[[DPAS_RES]] : vector<128xf32>
scf.yield %11 : vector<8x16xf32>
Expand Down
4 changes: 0 additions & 4 deletions test/Conversion/XeGPUToVC/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
# Test files in this directory that are added to the lit exclude set unless
# config.imex_enable_excluded_tests is truthy.
local_excludes = [
"gemm-scf.mlir",
"loadgather.mlir",
"prefetchnd.mlir",
"xegpu-to-vc.mlir",
]
# Only register the exclusions when excluded tests have not been enabled.
if(not config.imex_enable_excluded_tests):
config.excludes.update(local_excludes)
2 changes: 1 addition & 1 deletion test/Conversion/XeGPUToVC/loadgather.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ module @gemm attributes {gpu.container_module} {
// CHECK: %[[LOADA_v128f16:.*]] = vector.bitcast %[[LOADA_v128i32]] : vector<64xi32> to vector<128xf16>
%66 = vector.shape_cast %3: vector<16x8xf16> to vector<128xf16>

// CHECK: %[[LOADA_v64i32:.*]] = vector.bitcast %[[LOADA_v128f16]] : vector<128xf16> to vector<64xi32>
%6 = vector.shape_cast %66: vector<128xf16> to vector<8x8x2xf16>

// CHECK: %[[B_STRUCT:.*]]= arith.constant dense<0> : vector<4xi64>
Expand All @@ -46,6 +45,7 @@ module @gemm attributes {gpu.container_module} {

%4 = xegpu.load_nd %1 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>

// CHECK: %[[LOADA_v64i32:.*]] = vector.bitcast %[[LOADA_v128f16]] : vector<128xf16> to vector<64xi32>
// CHECK: %[[C_ACC_v128f32:.*]] = func.call @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(%{{.*}}, %[[LOADA_v64i32]], %{{.*}}) : (vector<128xi32>, vector<64xi32>, i32) -> vector<128xf32>
%5 = xegpu.dpas %6, %4 : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>

Expand Down
6 changes: 5 additions & 1 deletion test/Conversion/XeGPUToVC/prefetchnd.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ module @gemm attributes {gpu.container_module} {
// LSC: %[[LOAD2D_A_OFFSETX:.*]] = vector.extract %[[A_PAYLOAD]][5] : i32 from vector<8xi32>
// LSC: %[[LOAD2D_A_OFFSETY:.*]] = vector.extract %[[A_PAYLOAD]][6] : i32 from vector<8xi32>
// LSC: %[[LOAD2D_A_v64i32:.*]] = func.call @llvm.genx.lsc.load2d.stateless.v64i32.i1.i64({{.*}}, %[[LOAD2D_BASE_A]], {{.*}}, %[[LOAD2D_A_OFFSETX]], %[[LOAD2D_A_OFFSETY]]) : ({{.*}}) -> vector<64xi32>
// CHECK: %[[LOAD2D_A_v64i32_CAST:.*]] = vector.bitcast %[[LOAD2D_A_v64i32]] : vector<64xi32> to vector<128xf16>
%3 = xegpu.load_nd %0 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>

// RAW: %[[cst_39:.*]] = arith.constant dense<0> : vector<128xi32>
Expand All @@ -95,9 +96,12 @@ module @gemm attributes {gpu.container_module} {
// LSC: %[[LOAD2D_B_OFFSETX:.*]] = vector.extract %[[B_PAYLOAD]][5] : i32 from vector<8xi32>
// LSC: %[[LOAD2D_B_OFFSETY:.*]] = vector.extract %[[B_PAYLOAD]][6] : i32 from vector<8xi32>
// LSC: %[[LOAD2D_B_v128i32:.*]] = func.call @llvm.genx.lsc.load2d.stateless.v128i32.i1.i64({{.*}}, %[[LOAD2D_BASE_B]], {{.*}}, %[[LOAD2D_B_OFFSETX]], %[[LOAD2D_B_OFFSETY]]) : ({{.*}}) -> vector<128xi32>
// CHECK: %[[LOAD2D_B_v128i32_CAST:.*]] = vector.bitcast %[[LOAD2D_B_v128i32]] : vector<128xi32> to vector<256xf16>
%4 = xegpu.load_nd %1 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>

// CHECK: %[[C_ACC_v128f32:.*]] = func.call @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(%[[LOAD2D_B_v128i32]], %[[LOAD2D_A_v64i32]], %{{.*}}) : (vector<128xi32>, vector<64xi32>, i32) -> vector<128xf32>
// CHECK: %[[LOAD2D_A_v64i32_RECAST:.*]] = vector.bitcast %[[LOAD2D_A_v64i32_CAST]] : vector<128xf16> to vector<64xi32>
// CHECK: %[[LOAD2D_B_v128i32_RECAST:.*]] = vector.bitcast %[[LOAD2D_B_v128i32_CAST]] : vector<256xf16> to vector<128xi32>
// CHECK: %[[C_ACC_v128f32:.*]] = func.call @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(%[[LOAD2D_B_v128i32_RECAST]], %[[LOAD2D_A_v64i32_RECAST]], %{{.*}}) : (vector<128xi32>, vector<64xi32>, i32) -> vector<128xf32>
%5 = xegpu.dpas %3, %4 : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>

// RAW: func.call @llvm.genx.raw.sends2.noresult.i1.v8i32.v128f32({{.*}}, %[[C_PAYLOAD]], %[[C_ACC_v128f32]]) : ({{.*}}, vector<8xi32>, vector<128xf32>) -> ()
Expand Down
10 changes: 5 additions & 5 deletions test/Conversion/XeGPUToVC/xegpu-to-vc.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -55,24 +55,24 @@ module @gemm attributes {gpu.container_module} {
// LSC: %[[A_v4i64:.*]] = vector.bitcast %[[A_PAYLOAD_v8i32]] : vector<8xi32> to vector<4xi64>
// LSC: %[[BASE_A:.*]] = vector.extract %[[A_v4i64]][0] : i64 from vector<4xi64>
// LSC: %[[LOAD2D_A_v32i64:.*]] = func.call @llvm.genx.lsc.load.stateless.v32i64.i1.i64({{.*}}, %[[BASE_A]], %{{.*}}) : ({{.*}}) -> vector<32xi64>
// CHECK: %[[LOAD2D_A_v32i64_CAST:.*]] = vector.bitcast %[[LOAD2D_A_v32i64]] : vector<32xi64> to vector<128xf16>
%3 = xegpu.load_nd %0 : !xegpu.tensor_desc<128xf16> -> vector<128xf16>

// CHECK: %[[LOAD2D_A_v128f16:.*]] = vector.bitcast %[[LOAD2D_A_v32i64]] : vector<32xi64> to vector<128xf16>

// RAW: %[[LOAD2D_B_v128i32:.*]] = func.call @llvm.genx.raw.send2.v128i32.i1.v8i32({{.*}}, %[[B_PAYLOAD]], %{{.*}}) : ({{.*}}) -> vector<128xi32>

// LSC: %[[B_v4i64:.*]] = vector.bitcast %[[B_PAYLOAD]] : vector<8xi32> to vector<4xi64>
// LSC: %[[BASE_B:.*]] = vector.extract %[[B_v4i64]][0] : i64 from vector<4xi64>
// LSC: %[[B_OFFSETX:.*]] = vector.extract %[[B_PAYLOAD]][5] : i32 from vector<8xi32>
// LSC: %[[B_OFFSETY:.*]] = vector.extract %[[B_PAYLOAD]][6] : i32 from vector<8xi32>
// LSC: %[[LOAD2D_B_v128i32:.*]] = func.call @llvm.genx.lsc.load2d.stateless.v128i32.i1.i64({{.*}}, %[[BASE_B]], {{.*}}, %[[B_OFFSETX]], %[[B_OFFSETY]]) : {{.*}} -> vector<128xi32>
// CHECK: %[[LOAD2D_B_v128i32_CAST:.*]] = vector.bitcast %[[LOAD2D_B_v128i32]] : vector<128xi32> to vector<256xf16>
%4 = xegpu.load_nd %1 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
%6 = vector.shape_cast %3: vector<128xf16> to vector<8x8x2xf16>

// CHECK: %[[LOAD2D_A_v64i32:.*]] = vector.bitcast %[[LOAD2D_A_v128f16]] : vector<128xf16> to vector<64xi32>
// RAW: %[[C_ACC_v128f32:.*]] = func.call @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(%[[LOAD2D_B_v128i32]], %[[LOAD2D_A_v64i32]], %{{.*}}) : (vector<128xi32>, vector<64xi32>, i32) -> vector<128xf32>
// CHECK: %[[LOAD2D_A_v32i64_RECAST:.*]] = vector.bitcast %[[LOAD2D_A_v32i64_CAST]] : vector<128xf16> to vector<64xi32>
// CHECK: %[[LOAD2D_B_v128i32_RECAST:.*]] = vector.bitcast %[[LOAD2D_B_v128i32_CAST]] : vector<256xf16> to vector<128xi32>
// CHECK: %[[C_ACC_v128f32:.*]] = func.call @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(%[[LOAD2D_B_v128i32_RECAST]], %[[LOAD2D_A_v32i64_RECAST]], %{{.*}}) : (vector<128xi32>, vector<64xi32>, i32) -> vector<128xf32>

// LSC: %[[C_ACC_v128f32:.*]] = func.call @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(%[[LOAD2D_B_v128i32]], %[[LOAD2D_A_v64i32]], %{{.*}}) : (vector<128xi32>, vector<64xi32>, i32) -> vector<128xf32>
%5 = xegpu.dpas %6, %4 : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
%7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32>

Expand Down
23 changes: 23 additions & 0 deletions test/Transforms/InsertGpuAllocs/add-gpu-alloc-xetile.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// RUN: imex-opt --insert-gpu-allocs='client-api=opencl' %s | FileCheck %s --check-prefix=OPENCL
// RUN: imex-opt --insert-gpu-allocs='client-api=vulkan' %s | FileCheck %s --check-prefix=VULKAN

// Test input for the insert-gpu-allocs pass: two host-side memref.alloc buffers
// are accessed inside a gpu.launch region through xetile tile ops, and the
// second buffer is also returned from the function.
// NOTE(review): %arg0 and %arg1 are not referenced anywhere in the body.
func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf32> {
// %c1 is used for every grid and block dimension of the launch below.
%c1 = arith.constant 1 : index
// Expected allocations per client API, as stated by the patterns below:
//  - opencl client: both allocs become gpu.alloc, the second one host_shared
//    (presumably because %1 is returned to the caller -- confirm against the pass);
//  - vulkan client: both allocs stay as memref.alloc with alignment 128.
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc () : memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// VULKAN: %[[MEMREF0:.*]] = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>

%0 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
%1 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>

// Launch region that uses both buffers through xetile ops.
gpu.launch blocks(%arg2, %arg3, %arg4) in (%arg8 = %c1, %arg9 = %c1, %arg10 = %c1) threads(%arg5, %arg6, %arg7) in (%arg11 = %c1, %arg12 = %c1, %arg13 = %c1) {
%c0 = arith.constant 0 : index
// Load a 2x5 tile from %0 at offset [0, 0] ...
%src_tile = xetile.init_tile %0[%c0, %c0] : memref<2x5xf32> -> !xetile.tile<2x5xf32>
%src_value = xetile.load_tile %src_tile : !xetile.tile<2x5xf32> -> vector<2x5xf32>
// ... and store the loaded value into a tile over %1 at offset [0, 0].
%res_tile = xetile.init_tile %1[%c0, %c0] : memref<2x5xf32> -> !xetile.tile<2x5xf32>
xetile.store_tile %src_value, %res_tile: vector<2x5xf32>, !xetile.tile<2x5xf32>
gpu.terminator
// NOTE(review): the SCFToGPU_visited attribute presumably marks this launch as
// already handled by an SCF-to-GPU conversion -- confirm.
} {SCFToGPU_visited}
return %1 : memref<2x5xf32>
}

0 comments on commit 5fd3105

Please sign in to comment.