Move chunk_size into TensorDesc attribute
chencha3 committed Aug 2, 2024
1 parent b523ab2 commit 1d4d93d
Showing 51 changed files with 962 additions and 573 deletions.
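
At a glance: the chunk size of a scattered tensor descriptor moves off the xegpu.create_tdesc op and onto the descriptor type itself, and the old #xegpu.tdesc_attr (with its scattered flag) is split into dedicated block and scatter attributes. A minimal before/after sketch, distilled from the test updates below (operand names are illustrative):

// Before: chunk_size as an op attribute, scatter-ness as a boolean on the type.
%t = xegpu.create_tdesc %base, %offsets {chunk_size = 2}
    : memref<?xf16>, vector<16xindex>
    -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>

// After: chunk_size is carried by the dedicated scatter descriptor attribute.
%t = xegpu.create_tdesc %base, %offsets
    : memref<?xf16>, vector<16xindex>
    -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>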
395 changes: 395 additions & 0 deletions build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -1407,7 +1407,7 @@ struct XeGPUToVCPass : public ::imex::ConvertXeGPUToVCBase<XeGPUToVCPass> {

typeConverter.addConversion(
[&](xegpu::TensorDescType type) -> ::mlir::Type {
- if (type.getScattered()) {
+ if (type.isScattered()) {
return ::mlir::VectorType::get(
16, ::mlir::IndexType::get(&getContext()));
}
3 changes: 1 addition & 2 deletions lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
@@ -461,8 +461,7 @@ class SgInitTileOpPattern : public XeOneToNConversion<xetile::InitTileOp> {
std::swap(offsetsX, offsetsY);

auto tDescTy = mlir::xegpu::TensorDescType::get(
- innerBlk, elemTy, false /*scattered*/, array_length, memoryScope,
- true /*boundary_check*/);
+ innerBlk, elemTy, array_length, true /*boundary_check*/, memoryScope);

auto createIndexConstant = [&](mlir::Type type, int64_t value) {
auto attr = rewriter.getIndexAttr(value);
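
With the scattered flag gone from this builder, the remaining arguments read innerBlk, elemTy, array_length, boundary_check, memoryScope, and the call now yields a block descriptor. A sketch of the resulting type, with field values borrowed from the reduction tests below (the shape is illustrative):

!xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>>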
4 changes: 2 additions & 2 deletions test/Conversion/XeGPUToVC/atomiclsc.mlir
@@ -22,14 +22,14 @@ module @gemm attributes {gpu.container_module} {
// CHECK: %[[ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<4> : vector<16xindex>
// CHECK: %[[OFFSETS_ADJUSTED:.*]] = arith.muli %[[ELEMENT_BYTEWIDTH]], %[[OFFSETS]] : vector<16xindex>
// CHECK: %[[VEC_OFFSETS_APPLIED:.*]] = arith.addi %[[VEC_BASEPTR_SHUFFLED]], %[[OFFSETS_ADJUSTED]] : vector<16xindex>
- %2 = xegpu.create_tdesc %arg0, %offsets {chunk_size = 1} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
+ %2 = xegpu.create_tdesc %arg0, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>

// CHECK: %[[cst_3:.*]] = arith.constant dense<true> : vector<16xi1>
// CHECK: %[[cst_8:.*]] = arith.constant dense<0> : vector<16xi32>
// CHECK: %[[SRC0:.*]] = vector.bitcast %[[cst_0]] : vector<16xf32> to vector<16xi32>
// CHECK: %[[ATOMIC_RES:.*]] = func.call @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64({{.*}}, %[[VEC_OFFSETS_APPLIED]], %[[SRC0]], %[[cst_8]], {{.*}}, %[[cst_8]]) : ({{.*}}) -> vector<16xi32>
// CHECK: %{{.*}} = vector.bitcast %[[ATOMIC_RES]] : vector<16xi32> to vector<16xf32>
- %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+ %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
gpu.return
}
}
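
Note the empty parameter list above: {chunk_size = 1} collapses into #xegpu.scatter_tdesc_attr<>, which suggests a default chunk_size of 1 (an inference from this rewrite, not confirmed by the attribute's documentation). Under that assumption, these two descriptor types would be equivalent:

// Assumed equivalent spellings if chunk_size defaults to 1:
!xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
!xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>>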
8 changes: 4 additions & 4 deletions test/Conversion/XeGPUToVC/loadgather.mlir
@@ -19,7 +19,7 @@ module @gemm attributes {gpu.container_module} {
// CHECK: %[[IN_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
// CHECK: %[[IN_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[IN_ELEMENT_BYTEWIDTH]], %[[OFFSET]] : vector<16xindex>
// CHECK: %[[IN_PAYLOAD:.*]] = arith.addi %[[IN_PAYLOAD_BASEPTR_SHUFFLED]], %[[IN_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %tdesc_in = xegpu.create_tdesc %in, %offsets {chunk_size = 2} : memref<?xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>
+ %tdesc_in = xegpu.create_tdesc %in, %offsets : memref<?xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>

// CHECK: %[[OUT_EMPTY_PAYLOAD:.*]] = arith.constant dense<0> : vector<16xindex>
// CHECK: %[[OUT_BASEPTR:.*]] = memref.extract_aligned_pointer_as_index {{.*}} : memref<32xf16> -> index
@@ -28,16 +28,16 @@ module @gemm attributes {gpu.container_module} {
// CHECK: %[[OUT_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
// CHECK: %[[OUT_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[OUT_ELEMENT_BYTEWIDTH]], %[[OFFSET]] : vector<16xindex>
// CHECK: %[[OUT_PAYLOAD:.*]] = arith.addi %[[OUT_PAYLOAD_BASEPTR_SHUFFLED]], %[[OUT_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %tdesc_out = xegpu.create_tdesc %out_flat, %offsets {chunk_size = 2} : memref<32xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>
+ %tdesc_out = xegpu.create_tdesc %out_flat, %offsets : memref<32xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>

// CHECK: %[[OLD:.*]] = arith.constant dense<0> : vector<16xi32>
// CHECK: %[[LOAD_RES:.*]] = func.call @llvm.genx.raw.send2.v16i32.v16i1.v16i64({{.*}}, %[[MASK]], {{.*}}, %[[IN_PAYLOAD]], %[[OLD]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<16xi32>) -> vector<16xi32>
- %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1> -> vector<16x2xf16>
+ %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<16xi1> -> vector<16x2xf16>
// CHECK: %[[POST_OP_ELEMENT_TYPE_CAST:.*]] = vector.bitcast %[[LOAD_RES]] : vector<16xi32> to vector<32xf16>

// CHECK: %[[PRE_OP_ELEMENT_TYPE_CAST:.*]] = vector.bitcast %[[POST_OP_ELEMENT_TYPE_CAST]] : vector<32xf16> to vector<16xi32>
// CHECK: func.call @llvm.genx.raw.sends2.noresult.v16i1.v16i64.v16i32({{.*}}, %[[MASK]], {{.*}}, %[[OUT_PAYLOAD]], %[[PRE_OP_ELEMENT_TYPE_CAST]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<16xi32>) -> ()
- xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf16>, !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+ xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf16>, !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<16xi1>

gpu.return
}
8 changes: 4 additions & 4 deletions test/Conversion/XeGPUToVC/loadgather_dpas.mlir
@@ -19,11 +19,11 @@ module @gemm attributes {gpu.container_module} {
// CHECK: %[[IN_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
// CHECK: %[[IN_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[IN_ELEMENT_BYTEWIDTH]], %[[IN_OFFSET]] : vector<16xindex>
// CHECK: %[[IN_PAYLOAD:.*]] = arith.addi %[[IN_PAYLOAD_BASEPTR_SHUFFLED]], %[[IN_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %0 = xegpu.create_tdesc %arg0, %offsets {chunk_size = 8} : memref<128xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<scattered = true>>
+ %0 = xegpu.create_tdesc %arg0, %offsets : memref<128xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>

// CHECK: %[[OLD:.*]] = arith.constant dense<0> : vector<64xi32>
// CHECK: %[[LOAD_RES:.*]] = func.call @llvm.genx.raw.send2.v64i32.v16i1.v16i64({{.*}}, %[[MASK]], {{.*}}, %[[IN_PAYLOAD]], %[[OLD]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<64xi32>) -> vector<64xi32>
- %3 = xegpu.load %0, %mask : !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1> -> vector<16x8xf16>
+ %3 = xegpu.load %0, %mask : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1> -> vector<16x8xf16>

// CHECK: %[[LOADA_v128f16:.*]] = vector.bitcast %[[LOAD_RES]] : vector<64xi32> to vector<128xf16>
%66 = vector.shape_cast %3: vector<16x8xf16> to vector<128xf16>
@@ -58,12 +58,12 @@ module @gemm attributes {gpu.container_module} {
// CHECK: %[[OUT_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<4> : vector<16xindex>
// CHECK: %[[OUT_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[OUT_ELEMENT_BYTEWIDTH]], %[[OUT_OFFSET]] : vector<16xindex>
// CHECK: %[[OUT_PAYLOAD:.*]] = arith.addi %[[OUT_PAYLOAD_BASEPTR_SHUFFLED]], %[[OUT_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %2 = xegpu.create_tdesc %arg2, %offsets2 {chunk_size = 8} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr<scattered = true>>
+ %2 = xegpu.create_tdesc %arg2, %offsets2 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
%7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32>
%8 = vector.shape_cast %7: vector<128xf32> to vector<16x8xf32>

// CHECK: func.call @llvm.genx.raw.sends2.noresult.v16i1.v16i64.v128f32({{.*}}, %[[MASK]], {{.*}}, %[[OUT_PAYLOAD]], %[[C_ACC_v128f32]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<128xf32>) -> ()
- xegpu.store %8, %2, %mask : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+ xegpu.store %8, %2, %mask : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>

gpu.return
}
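
The shapes here follow directly from the descriptor: with chunk_size = 8, each of the 16 offsets addresses 8 contiguous elements, so a gather yields 16 x 8 = 128 elements. A condensed sketch of that relationship (values taken from the test above):

// 16 offsets x chunk_size 8 = 128 elements per gather:
%ld = xegpu.load %td, %mask
    : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>
    -> vector<16x8xf16>
%flat = vector.shape_cast %ld : vector<16x8xf16> to vector<128xf16>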
16 changes: 8 additions & 8 deletions test/Conversion/XeTileToXeGPU/reduction.mlir
@@ -10,10 +10,10 @@ module {
%c0 = arith.constant 0 : index
%acc = arith.constant dense<0.0> : vector<16xf16>
//CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16>
- //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+ //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>>
%t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16>
//CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}>
- //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true, scattered = false>> -> vector<16x32xf16>
+ //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>> -> vector<16x32xf16>
%v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>

//CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16>
@@ -179,9 +179,9 @@ module {
%r = vector.multi_reduction <add>, %e, %acc [1] : vector<16x32xf16> to vector<16xf16>
//CHECK: %[[R161:.*]] = vector.shape_cast %[[R160]] : vector<16xf16> to vector<2x8xf16>
%c = vector.shape_cast %r: vector<16xf16> to vector<2x8xf16>
- //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+ //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>>
%s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<2x8xf16>
- //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+ //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>>
xetile.store_tile %c, %s : vector<2x8xf16>, !xetile.tile<2x8xf16>
gpu.return
}
@@ -193,10 +193,10 @@ module {
%c0 = arith.constant 0 : index
%acc = arith.constant dense<0.0> : vector<32xf16>
//CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16>
- //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+ //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>>
%t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16>
//CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}>
- //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true, scattered = false>> -> vector<16x32xf16>
+ //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>> -> vector<16x32xf16>
%v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
//CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16>
//CHECK: %[[R3:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [1, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16>
@@ -318,9 +318,9 @@ module {
%r = vector.multi_reduction <add>, %e, %acc [0] : vector<16x32xf16> to vector<32xf16>
//CHECK: %[[R118:.*]] = vector.shape_cast %[[R117]] : vector<32xf16> to vector<4x8xf16>
%c = vector.shape_cast %r: vector<32xf16> to vector<4x8xf16>
- //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+ //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>>
%s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<4x8xf16>
- //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+ //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>>
xetile.store_tile %c, %s : vector<4x8xf16>, !xetile.tile<4x8xf16>
gpu.return
}
