From 1d4d93d27a92a1d4e142afc939973701f079686f Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Fri, 2 Aug 2024 19:05:22 +0000 Subject: [PATCH] Move chunk_size into TensorDesc attribute --- ...0013-Move-chunk_size-into-TensorDesc.patch | 395 ++++++++++++++++++ lib/Conversion/XeGPUToVC/XeGPUToVC.cpp | 2 +- .../XeTileToXeGPU/XeTileOpConversion.cpp | 3 +- test/Conversion/XeGPUToVC/atomiclsc.mlir | 4 +- test/Conversion/XeGPUToVC/loadgather.mlir | 8 +- .../Conversion/XeGPUToVC/loadgather_dpas.mlir | 8 +- test/Conversion/XeTileToXeGPU/reduction.mlir | 16 +- .../sg_gemm_1k_1k_1k_f16_f32.mlir | 222 +++++----- .../sg_gemm_1k_1k_1k_i8_i32.mlir | 38 +- .../sg_gemm_1k_1k_1k_tf32_tf32.mlir | 80 ++-- .../XeTileToXeGPU/sg_load_tile.mlir | 4 +- .../XeTileToXeGPU/sg_mixed_scf.mlir | 26 +- test/Conversion/XeTileToXeGPU/sg_scf_for.mlir | 26 +- test/Conversion/XeTileToXeGPU/sg_softmax.mlir | 16 +- .../XeTileToXeGPU/sg_store_tile.mlir | 32 +- .../Conversion/XeTileToXeGPU/sg_tile_mma.mlir | 12 +- .../XeTileToXeGPU/sg_tiled_broadcast.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_load_tile.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_scf_for.mlir | 26 +- .../XeTileToXeGPU/sg_tiled_softmax.mlir | 16 +- .../XeTileToXeGPU/sg_tiled_store_tile.mlir | 32 +- .../XeTileToXeGPU/sg_tiled_tile_mma.mlir | 12 +- test/Conversion/XeTileToXeGPU/test_order.mlir | 16 +- test/Dialect/XeGPU/IR/XeGPUOps.mlir | 9 +- test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir | 18 +- test/Dialect/XeGPU/IR/create_nd_tdesc.mlir | 16 +- test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir | 16 +- test/Dialect/XeGPU/IR/create_tdesc_vc.mlir | 32 +- test/Dialect/XeGPU/IR/invalid_vc.mlir | 13 +- test/Dialect/XeGPU/IR/load_gather_vc.mlir | 30 +- test/Dialect/XeGPU/IR/load_nd_vc.mlir | 8 +- test/Dialect/XeGPU/IR/store_scatter.mlir | 16 +- test/Dialect/XeGPU/IR/store_scatter_vc.mlir | 17 +- test/Dialect/XeGPU/IR/update_offset_vc.mlir | 12 +- .../XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir | 30 +- ...kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir | 30 +- ...4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir | 30 +- .../load_with_block_array_16_16_2.vc.mlir | 4 +- .../load_with_block_array_32_16_2.vc.mlir | 4 +- .../load_with_block_array_8_16_2.vc.mlir | 4 +- .../XeGPU/loadgather2d_masked_f32.mlir | 20 +- .../XeGPU/loadgather2d_masked_slm_f32.mlir | 20 +- .../XeGPU/loadgather_chunk_size_f32.mlir | 8 +- .../XeGPU/loadgather_chunk_size_i32.mlir | 8 +- .../Dialect/XeGPU/loadgather_f32.mlir | 8 +- .../Dialect/XeGPU/loadgather_masked_f32.mlir | 8 +- .../XeGPU/loadgather_masked_slm_f32.mlir | 8 +- .../Dialect/XeGPU/loadgather_slm_f32.mlir | 8 +- .../vector_extract_strided_slice_2.vc.mlir | 4 +- .../postop_reduce_n.mlir | 76 ++-- .../VectorLinearize/postop_reduce_n.mlir | 76 ++-- 51 files changed, 962 insertions(+), 573 deletions(-) create mode 100644 build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch diff --git a/build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch b/build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch new file mode 100644 index 000000000..9b190e0e3 --- /dev/null +++ b/build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch @@ -0,0 +1,395 @@ +From 94685ba4f22afa8922feebe292e8b525b8d012b7 Mon Sep 17 00:00:00 2001 +From: Chao Chen +Date: Mon, 29 Jul 2024 18:40:29 +0000 +Subject: [PATCH] Move chunk_size into TensorDesc + +--- + .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 43 ++++++++++--- + .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 12 ++-- + .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 63 ++++++++++++------- + mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 41 
++++++++---- + mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 21 ++++--- + 5 files changed, 121 insertions(+), 59 deletions(-) + +diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +index f3ca09a6a68e..1dfe55a4bba0 100644 +--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td ++++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +@@ -19,9 +19,15 @@ class XeGPUAttr traits = [], + let mnemonic = attrMnemonic; + } + +-def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { ++class XeGPU_TensorDescAttr traits = [], ++ string baseCppClass = "::mlir::Attribute"> ++ : XeGPUAttr { ++ let assemblyFormat = "`<` struct(params) `>`"; ++} ++ ++def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> { + let summary = [{a composite attribute for `TensorDescType`}]; +- let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite ++ let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite + attribute defined for `TensorDescType` for describing following + properties of a `TensorDesc`. + 1. `memory_scope`: It describes where the data block described by the +@@ -33,27 +39,46 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { + 8x32. Its default value is 1. + 3. `boundary_check`: It is used to indicates the hardware whether to do + out-of-boundary check. The default value is true. +- 4. `scattered`: It is used to differenciate TensorDescs created from +- `create_nd_tdesc` vs from `create_tdesc`. + }]; + + let parameters = (ins + OptionalParameter<"MemoryScopeAttr">: $memory_scope, + OptionalParameter<"IntegerAttr", "1">: $array_length, +- OptionalParameter<"BoolAttr", "true">: $boundary_check, +- OptionalParameter<"BoolAttr", "false">: $scattered ++ OptionalParameter<"BoolAttr", "true">: $boundary_check + ); + + let builders = [ + AttrBuilder<(ins + CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, + CArg<"int", "1">:$array_length, +- CArg<"bool", "true">: $boundary_check, +- CArg<"bool", "false">: $scattered ++ CArg<"bool", "true">: $boundary_check + )> + ]; ++} + +- let assemblyFormat = "`<` struct(params) `>`"; ++def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> { ++ let summary = [{a composite attribute for `TensorDescType`}]; ++ let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite ++ attribute defined for `TensorDescType` for describing following ++ properties of a `TensorDesc`. ++ 1. `memory_scope`: It describes where the data block described by the ++ TensorDesc is located, `Global` device memory or `Shared` local memory. ++ It is default to `Global`. ++ 2. `chunk_size`: indicates number of continious elements accessed for each ++ offset, default is 1. It is used with `scattered` attr only. 
++ }]; ++ ++ let parameters = (ins ++ OptionalParameter<"MemoryScopeAttr">: $memory_scope, ++ OptionalParameter<"IntegerAttr", "1">: $chunk_size ++ ); ++ ++ let builders = [ ++ AttrBuilder<(ins ++ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, ++ CArg<"int", "1">: $chunk_size ++ )> ++ ]; + } + + //===----------------------------------------------------------------------===// +diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +index 7111126f9c28..d3b38836b70b 100644 +--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td ++++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +@@ -403,33 +403,31 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { + is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, + implying each element in the array corresponds to a work-item (SIMT lane) + in the subgroup. +- * chunk_size: [optional attribute] indicates number of continious +- elements accessed for each offset, default is 1. ++ + + Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] + ``` + %a = memref.alloc() : memref<1024xf32> +- %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32> ++ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32, chunk_size_per_lane = 1> + ``` + + Example 2. It assumes subgroup size is 4, and each workitem access 8 elements. + It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] + ``` + %0 = memref.alloc() : memref<1024xf32> +- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> ++ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8> + ``` + + Example 3. It is similar to Example 2, but there is some overlaps among workitems. 
+ It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
+ ```
+ %0 = memref.alloc() : memref<1024xf32>
+- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
++ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>
+ ```
+ }];
+
+ let arguments = (ins XeGPU_BaseAddrType: $source,
+- XeGPU_OffsetType: $offsets,
+- DefaultValuedAttr<I64Attr, "1">: $chunk_size);
++ XeGPU_OffsetType: $offsets);
+ let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+ let assemblyFormat = [{
+diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+index 111a270a28b2..0c4dc11256d5 100644
+--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
++++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+@@ -88,11 +88,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+ TypeBuilderWithInferredContext<(ins
+ "llvm::ArrayRef<int64_t>": $shape,
+ "mlir::Type": $elementType,
+- CArg<"bool", "false">: $scattered,
+ CArg<"int", "1">: $array_length,
+- CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+- CArg<"bool", "true">: $boundary_check
+- )>
++ CArg<"bool", "true">: $boundary_check,
++ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope)>,
++ TypeBuilderWithInferredContext<(ins
++ "llvm::ArrayRef<int64_t>": $shape,
++ "mlir::Type": $elementType,
++ CArg<"int", "1">: $chunk_size,
++ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope)>
+ ];
+
+ let extraClassDeclaration = [{
+@@ -110,40 +113,58 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+ return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+ }
+
+- TensorDescAttr getEncodingAsTensorDescAttr() const {
+- return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
++ BlockTensorDescAttr getEncodingAsBlockTensorDescAttr() const {
++ return llvm::dyn_cast_if_present<BlockTensorDescAttr>(getEncoding());
++ }
++
++ ScatterTensorDescAttr getEncodingAsScatterTensorDescAttr() const {
++ return llvm::dyn_cast_if_present<ScatterTensorDescAttr>(getEncoding());
+ }
+
+ xegpu::MemoryScope getMemoryScope() const {
+- auto attr = getEncodingAsTensorDescAttr();
+- if (attr && attr.getMemoryScope())
+- return attr.getMemoryScope().getValue();
++ auto block_attr = getEncodingAsBlockTensorDescAttr();
++ if (block_attr && block_attr.getMemoryScope())
++ return block_attr.getMemoryScope().getValue();
++
++ auto scatter_attr = getEncodingAsScatterTensorDescAttr();
++ if (scatter_attr && scatter_attr.getMemoryScope())
++ return scatter_attr.getMemoryScope().getValue();
++
+ // return default value
+ return MemoryScope::Global;
+ }
+
+ int getArrayLength() {
+- auto attr = getEncodingAsTensorDescAttr();
+- if (attr && attr.getArrayLength())
+- return attr.getArrayLength().getInt();
++ auto attr = getEncoding();
++ auto block_attr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(attr);
++ assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr.");
++ if (block_attr && block_attr.getArrayLength())
++ return block_attr.getArrayLength().getInt();
+ // return default value
+ return 1;
+ }
+
+ bool getBoundaryCheck() {
+- auto attr = getEncodingAsTensorDescAttr();
+- if (attr && attr.getBoundaryCheck())
+- return attr.getBoundaryCheck().getValue();
++ auto attr = getEncoding();
++ auto block_attr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(attr);
++ assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr.");
++ if (block_attr && block_attr.getBoundaryCheck())
++ return block_attr.getBoundaryCheck().getValue();
+ // return default value
+ return 
true;
+ }
+
+- bool getScattered() {
+- auto attr = getEncodingAsTensorDescAttr();
+- if (attr && attr.getScattered())
+- return attr.getScattered().getValue();
+- // return default value
+- return false;
++ bool isScattered() {
++ return bool(getEncodingAsScatterTensorDescAttr());
++ }
++
++ int getChunkSize() {
++ auto attr = getEncoding();
++ auto scatter_attr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(attr);
++ assert((!attr || scatter_attr) && "invalid on non ScatterTensorDescAttr.");
++ if (scatter_attr && scatter_attr.getChunkSize())
++ return scatter_attr.getChunkSize().getInt();
++ return 1;
+ }
+ }];
+
+diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+index 24719fe748fe..a5632c3fab8c 100644
+--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
++++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+@@ -30,20 +30,31 @@ void XeGPUDialect::initialize() {
+ }
+
+ //===----------------------------------------------------------------------===//
+-// XeGPU_TensorDescAttr
++// XeGPU_BlockTensorDescAttr
+ //===----------------------------------------------------------------------===//
+-TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context,
+- xegpu::MemoryScope memory_scope,
+- int array_length, bool boundary_check,
+- bool scattered) {
++BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context,
++ xegpu::MemoryScope memory_scope,
++ int array_length, bool boundary_check) {
+ auto scopeAttr = MemoryScopeAttr::get(context, memory_scope);
+ auto lengthAttr =
+ IntegerAttr::get(IntegerType::get(context, 64), array_length);
+ auto boundaryAttr = BoolAttr::get(context, boundary_check);
+- auto scatteredAttr = BoolAttr::get(context, scattered);
+- return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr);
++ return Base::get(context, scopeAttr, lengthAttr, boundaryAttr);
+ }
+
++//===----------------------------------------------------------------------===//
++// XeGPU_ScatterTensorDescAttr
++//===----------------------------------------------------------------------===//
++ScatterTensorDescAttr ScatterTensorDescAttr::get(mlir::MLIRContext *context,
++ xegpu::MemoryScope memory_scope,
++ int chunk_size) {
++ auto scopeAttr = MemoryScopeAttr::get(context, memory_scope);
++ auto chunkSizeAttr =
++ IntegerAttr::get(IntegerType::get(context, 64), chunk_size);
++ return Base::get(context, scopeAttr, chunkSizeAttr);
++}
++
++
+ //===----------------------------------------------------------------------===//
+ // XeGPU_TensorDescType
+ //===----------------------------------------------------------------------===//
+@@ -108,12 +119,18 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+ }
+
+ TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
+- mlir::Type elementType, bool scattered,
+- int array_length, MemoryScope memory_scope,
+- bool boundary_check) {
++ mlir::Type elementType, int array_length,
++ bool boundary_check, MemoryScope memory_scope) {
++ auto context = elementType.getContext();
++ auto attr = BlockTensorDescAttr::get(context, memory_scope, array_length, boundary_check);
++ return Base::get(context, shape, elementType, attr);
++}
++
++TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
++ mlir::Type elementType, int chunk_size,
++ MemoryScope memory_scope) {
+ auto context = elementType.getContext();
+- auto attr = TensorDescAttr::get(context, memory_scope, array_length,
+- boundary_check, scattered);
++ auto attr = ScatterTensorDescAttr::get(context, memory_scope, chunk_size);
+ return 
Base::get(context, shape, elementType, attr);
+ }
+
+diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+index 2bdc87f36fa3..7591316d9fe1 100644
+--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
++++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+@@ -149,7 +149,7 @@ LogicalResult CreateNdDescOp::verify() {
+ return emitOpError("TensorDesc should have the same element "
+ "type with the source if it is a memref.\n");
+
+- if (getType().getScattered())
++ if (getType().isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ return success();
+@@ -160,7 +160,7 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult PrefetchNdOp::verify() {
+ auto tdescTy = getTensorDescType();
+- if (tdescTy.getScattered())
++ if (tdescTy.isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+@@ -185,7 +185,7 @@ LogicalResult LoadNdOp::verify() {
+ if (tdescTy.getRank() > 2)
+ return emitOpError("Expecting a 1D/2D TensorDesc.\n");
+
+- if (tdescTy.getScattered())
++ if (tdescTy.isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ if (!valueTy)
+@@ -253,7 +253,7 @@ LogicalResult StoreNdOp::verify() {
+ if (dstTy.getRank() > 2)
+ return emitOpError("Expecting a 1D/2D TensorDesc.\n");
+
+- if (dstTy.getScattered())
++ if (dstTy.isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ if (!valTy)
+@@ -276,7 +276,7 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult UpdateNdOffsetOp::verify() {
+ auto ty = getTensorDescType();
+- if (ty.getScattered())
++ if (ty.isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ // number of offsets specified must match the rank of the tensor descriptor
+@@ -291,15 +291,16 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult CreateDescOp::verify() {
+ auto tdescTy = getTensorDescType();
+- auto chunkSize = getChunkSize();
+
+ if (getRankOf(getSource()) > 1)
+ return emitOpError(
+ "Expecting the source is a 1D memref or pointer (uint64_t).");
+
+- if (!tdescTy.getScattered())
++ if (!tdescTy.isScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
++ auto chunkSize = tdescTy.getChunkSize();
++
+ SmallVector<int64_t> shape({(int64_t)getNumOffsets()});
+ if (chunkSize != 1)
+ shape.push_back(chunkSize);
+@@ -317,7 +318,7 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult PrefetchOp::verify() {
+ auto tdescTy = getTensorDescType();
+- if (!tdescTy.getScattered())
++ if (!tdescTy.isScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+@@ -340,7 +341,7 @@ LogicalResult LoadGatherOp::verify() {
+ auto maskTy = getMaskType();
+ auto valueTy = getValueType();
+
+- if (!tdescTy.getScattered())
++ if (!tdescTy.isScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+@@ -386,7 +387,7 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult StoreScatterOp::verify() {
+ auto tdescTy = getTensorDescType();
+- if 
(!tdescTy.getScattered())
++ if (!tdescTy.isScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isWriteHintOrNone(getL1HintAttr()))
+--
+2.34.1
diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
index bfee41a47..28bc8f4be 100644
--- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
+++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -1407,7 +1407,7 @@ struct XeGPUToVCPass : public ::imex::ConvertXeGPUToVCBase<XeGPUToVCPass> {
 typeConverter.addConversion(
 [&](xegpu::TensorDescType type) -> ::mlir::Type {
- if (type.getScattered()) {
+ if (type.isScattered()) {
 return ::mlir::VectorType::get(
 16, ::mlir::IndexType::get(&getContext()));
 }
diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
index 8fc2fdf10..e23ffd143 100644
--- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
+++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
@@ -461,8 +461,7 @@ class SgInitTileOpPattern : public XeOneToNConversion<xetile::InitTileOp> {
 std::swap(offsetsX, offsetsY);

 auto tDescTy = mlir::xegpu::TensorDescType::get(
- innerBlk, elemTy, false /*scattered*/, array_length, memoryScope,
- true /*boundary_check*/);
+ innerBlk, elemTy, array_length, true /*boundary_check*/, memoryScope);

 auto createIndexConstant = [&](mlir::Type type, int64_t value) {
 auto attr = rewriter.getIndexAttr(value);
diff --git a/test/Conversion/XeGPUToVC/atomiclsc.mlir b/test/Conversion/XeGPUToVC/atomiclsc.mlir
index 58951cf1e..af2d8c0f4 100644
--- a/test/Conversion/XeGPUToVC/atomiclsc.mlir
+++ b/test/Conversion/XeGPUToVC/atomiclsc.mlir
@@ -22,14 +22,14 @@ module @gemm attributes {gpu.container_module} {
 // CHECK: %[[ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<4> : vector<16xindex>
 // CHECK: %[[OFFSETS_ADJUSTED:.*]] = arith.muli %[[ELEMENT_BYTEWIDTH]], %[[OFFSETS]] : vector<16xindex>
 // CHECK: %[[VEC_OFFSETS_APPLIED:.*]] = arith.addi %[[VEC_BASEPTR_SHUFFLED]], %[[OFFSETS_ADJUSTED]] : vector<16xindex>
- %2 = xegpu.create_tdesc %arg0, %offsets {chunk_size = 1} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
+ %2 = xegpu.create_tdesc %arg0, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>

 // CHECK: %[[cst_3:.*]] = arith.constant dense<true> : vector<16xi1>
 // CHECK: %[[cst_8:.*]] = arith.constant dense<0> : vector<16xi32>
 // CHECK: %[[SRC0:.*]] = vector.bitcast %[[cst_0]] : vector<16xf32> to vector<16xi32>
 // CHECK: %[[ATOMIC_RES:.*]] = func.call @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64({{.*}}, %[[VEC_OFFSETS_APPLIED]], %[[SRC0]], %[[cst_8]], {{.*}}, %[[cst_8]]) : ({{.*}}) -> vector<16xi32>
 // CHECK: %{{.*}} = vector.bitcast %[[ATOMIC_RES]] : vector<16xi32> to vector<16xf32>
- %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+ %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
 gpu.return
 }
}
diff --git a/test/Conversion/XeGPUToVC/loadgather.mlir b/test/Conversion/XeGPUToVC/loadgather.mlir
index bcad0a2e1..7109d51dc 100644
--- a/test/Conversion/XeGPUToVC/loadgather.mlir
+++ b/test/Conversion/XeGPUToVC/loadgather.mlir
@@ -19,7 +19,7 @@ module @gemm attributes {gpu.container_module} {
 // CHECK: %[[IN_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
 // CHECK: %[[IN_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[IN_ELEMENT_BYTEWIDTH]], %[[OFFSET]] : 
vector<16xindex>
 // CHECK: %[[IN_PAYLOAD:.*]] = arith.addi %[[IN_PAYLOAD_BASEPTR_SHUFFLED]], %[[IN_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %tdesc_in = xegpu.create_tdesc %in, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>
+ %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>

 // CHECK: %[[OUT_EMPTY_PAYLOAD:.*]] = arith.constant dense<0> : vector<16xindex>
 // CHECK: %[[OUT_BASEPTR:.*]] = memref.extract_aligned_pointer_as_index {{.*}} : memref<32xf16> -> index
@@ -28,16 +28,16 @@
 // CHECK: %[[OUT_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
 // CHECK: %[[OUT_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[OUT_ELEMENT_BYTEWIDTH]], %[[OFFSET]] : vector<16xindex>
 // CHECK: %[[OUT_PAYLOAD:.*]] = arith.addi %[[OUT_PAYLOAD_BASEPTR_SHUFFLED]], %[[OUT_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %tdesc_out = xegpu.create_tdesc %out_flat, %offsets {chunk_size = 2} : memref<32xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>
+ %tdesc_out = xegpu.create_tdesc %out_flat, %offsets : memref<32xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>

 // CHECK: %[[OLD:.*]] = arith.constant dense<0> : vector<16xi32>
 // CHECK: %[[LOAD_RES:.*]] = func.call @llvm.genx.raw.send2.v16i32.v16i1.v16i64({{.*}}, %[[MASK]], {{.*}}, %[[IN_PAYLOAD]], %[[OLD]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<16xi32>) -> vector<16xi32>
- %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1> -> vector<16x2xf16>
+ %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf16>

 // CHECK: %[[POST_OP_ELEMENT_TYPE_CAST:.*]] = vector.bitcast %[[LOAD_RES]] : vector<16xi32> to vector<32xf16>
 // CHECK: %[[PRE_OP_ELEMENT_TYPE_CAST:.*]] = vector.bitcast %[[POST_OP_ELEMENT_TYPE_CAST]] : vector<32xf16> to vector<16xi32>
 // CHECK: func.call @llvm.genx.raw.sends2.noresult.v16i1.v16i64.v16i32({{.*}}, %[[MASK]], {{.*}}, %[[OUT_PAYLOAD]], %[[PRE_OP_ELEMENT_TYPE_CAST]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<16xi32>) -> ()
- xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf16>, !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+ xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf16>, !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>

 gpu.return
 }
diff --git a/test/Conversion/XeGPUToVC/loadgather_dpas.mlir b/test/Conversion/XeGPUToVC/loadgather_dpas.mlir
index 600888f0e..1e45fa2d4 100644
--- a/test/Conversion/XeGPUToVC/loadgather_dpas.mlir
+++ b/test/Conversion/XeGPUToVC/loadgather_dpas.mlir
@@ -19,11 +19,11 @@ module @gemm attributes {gpu.container_module} {
 // CHECK: %[[IN_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
 // CHECK: %[[IN_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[IN_ELEMENT_BYTEWIDTH]], %[[IN_OFFSET]] : vector<16xindex>
 // CHECK: %[[IN_PAYLOAD:.*]] = arith.addi %[[IN_PAYLOAD_BASEPTR_SHUFFLED]], %[[IN_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %0 = xegpu.create_tdesc %arg0, %offsets {chunk_size = 8} : memref<128xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<scattered = true>>
+ %0 = xegpu.create_tdesc %arg0, %offsets : memref<128xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>

 // CHECK: %[[OLD:.*]] 
= arith.constant dense<0> : vector<64xi32>
 // CHECK: %[[LOAD_RES:.*]] = func.call @llvm.genx.raw.send2.v64i32.v16i1.v16i64({{.*}}, %[[MASK]], {{.*}}, %[[IN_PAYLOAD]], %[[OLD]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<64xi32>) -> vector<64xi32>
- %3 = xegpu.load %0, %mask : !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1> -> vector<16x8xf16>
+ %3 = xegpu.load %0, %mask : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1> -> vector<16x8xf16>

 // CHECK: %[[LOADA_v128f16:.*]] = vector.bitcast %[[LOAD_RES]] : vector<64xi32> to vector<128xf16>
 %66 = vector.shape_cast %3: vector<16x8xf16> to vector<128xf16>
@@ -58,12 +58,12 @@
 // CHECK: %[[OUT_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<4> : vector<16xindex>
 // CHECK: %[[OUT_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[OUT_ELEMENT_BYTEWIDTH]], %[[OUT_OFFSET]] : vector<16xindex>
 // CHECK: %[[OUT_PAYLOAD:.*]] = arith.addi %[[OUT_PAYLOAD_BASEPTR_SHUFFLED]], %[[OUT_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %2 = xegpu.create_tdesc %arg2, %offsets2 {chunk_size = 8} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr<scattered = true>>
+ %2 = xegpu.create_tdesc %arg2, %offsets2 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>

 %7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32>
 %8 = vector.shape_cast %7: vector<128xf32> to vector<16x8xf32>
 // CHECK: func.call @llvm.genx.raw.sends2.noresult.v16i1.v16i64.v128f32({{.*}}, %[[MASK]], {{.*}}, %[[OUT_PAYLOAD]], %[[C_ACC_v128f32]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<128xf32>) -> ()
- xegpu.store %8, %2, %mask : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+ xegpu.store %8, %2, %mask : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>

 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/reduction.mlir b/test/Conversion/XeTileToXeGPU/reduction.mlir
index 302e1b6df..6c50907ed 100644
--- a/test/Conversion/XeTileToXeGPU/reduction.mlir
+++ b/test/Conversion/XeTileToXeGPU/reduction.mlir
@@ -10,10 +10,10 @@ module {
 %c0 = arith.constant 0 : index
 %acc = arith.constant dense<0.0> : vector<16xf16>
 //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16>
- //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr>
+ //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr>
 %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16>
 //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr> -> vector<16x32xf16>
+ //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16>
 %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
 //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16>
@@ -179,9 +179,9 @@ module {
 %r = vector.multi_reduction , %e, %acc [1] : vector<16x32xf16> to vector<16xf16>
 //CHECK: %[[R161:.*]] = vector.shape_cast %[[R160]] : vector<16xf16> to vector<2x8xf16>
 %c = vector.shape_cast %r: vector<16xf16> to vector<2x8xf16>
- //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, 
#xegpu.tdesc_attr> + //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<2x8xf16> - //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.tdesc_attr> + //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> xetile.store_tile %c, %s : vector<2x8xf16>, !xetile.tile<2x8xf16> gpu.return } @@ -193,10 +193,10 @@ module { %c0 = arith.constant 0 : index %acc = arith.constant dense<0.0> : vector<32xf16> //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16> //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr> -> vector<16x32xf16> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16> //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> //CHECK: %[[R3:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [1, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> @@ -318,9 +318,9 @@ module { %r = vector.multi_reduction , %e, %acc [0] : vector<16x32xf16> to vector<32xf16> //CHECK: %[[R118:.*]] = vector.shape_cast %[[R117]] : vector<32xf16> to vector<4x8xf16> %c = vector.shape_cast %r: vector<32xf16> to vector<4x8xf16> - //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.tdesc_attr> + //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<4x8xf16> - //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.tdesc_attr> + //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> xetile.store_tile %c, %s : vector<4x8xf16>, !xetile.tile<4x8xf16> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir index 3842da6ec..2040fd824 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir @@ -25,95 +25,95 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> 
-> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index //CHECK: %[[r7:.*]] = arith.addi %[[r1]], %[[c32]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c48:.*]] = arith.constant 48 : index //CHECK: %[[r9:.*]] = arith.addi %[[r1]], %[[c48]] : index - //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r11:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r16:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r19:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: 
%[[r18:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r19:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r21:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r24:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r25:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r24:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r25:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r26:.*]] = arith.addi %[[r0]], %[[c32]] : index - //CHECK: %[[r27:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r28:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r29:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r30:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r27:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r28:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r29:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r30:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c40:.*]] = arith.constant 40 : index //CHECK: %[[r31:.*]] = arith.addi %[[r0]], %[[c40]] : index - //CHECK: %[[r32:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r33:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r34:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r7]]] : memref<1024x1024xf32> -> 
!xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r35:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r32:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r33:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r34:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r35:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r36:.*]] = arith.addi %[[r0]], %[[c48]] : index - //CHECK: %[[r37:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r38:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r39:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r40:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r37:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r38:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r39:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r40:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c56:.*]] = arith.constant 56 : index //CHECK: %[[r41:.*]] = arith.addi %[[r0]], %[[c56]] : index - //CHECK: %[[r42:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r43:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r44:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r45:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r46:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r47:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r48:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r49:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r50:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r51:.*]] = 
xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r52:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r53:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r42:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r43:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r44:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r45:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r46:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r47:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r48:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r49:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r50:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r51:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r52:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r53:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xf32> -> !xetile.tile<64x64xf32> - //CHECK: %[[r54:.*]] = xegpu.load_nd %[[r46]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r55:.*]] = xegpu.load_nd %[[r47]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r56:.*]] = xegpu.load_nd %[[r48]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r57:.*]] = xegpu.load_nd %[[r49]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r58:.*]] = xegpu.load_nd %[[r50]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r59:.*]] = xegpu.load_nd %[[r51]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = 
#xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r60:.*]] = xegpu.load_nd %[[r52]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r61:.*]] = xegpu.load_nd %[[r53]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r54:.*]] = xegpu.load_nd %[[r46]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r55:.*]] = xegpu.load_nd %[[r47]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r56:.*]] = xegpu.load_nd %[[r48]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r57:.*]] = xegpu.load_nd %[[r49]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r58:.*]] = xegpu.load_nd %[[r50]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r59:.*]] = xegpu.load_nd %[[r51]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r60:.*]] = xegpu.load_nd %[[r52]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r61:.*]] = xegpu.load_nd %[[r53]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<64x64xf32> -> vector<64x64xf32> - //CHECK: %[[r62:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r63:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r64:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r65:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + //CHECK: %[[r62:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r63:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r64:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r65:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> 
%a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> - //CHECK: %[[r66:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r67:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r68:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r69:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + //CHECK: %[[r66:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r67:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r68:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r69:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> //CHECK: %[[r72:.*]]:16 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] //CHECK-SAME: iter_args(%[[arg4:.*]] = %[[r62]], %[[arg5:.*]] = %[[r63]], %[[arg6:.*]] = %[[r64]], %[[arg7:.*]] = %[[r65]], %[[arg8:.*]] = %[[r66]], //CHECK-SAME: %[[arg9:.*]] = %[[r67]], %[[arg10:.*]] = %[[r68]], %[[arg11:.*]] = %[[r69]], %[[arg12:.*]] = %[[r54]], %[[arg13:.*]] = %[[r55]], //CHECK-SAME: %[[arg14:.*]] = %[[r56]], %[[arg15:.*]] = %[[r57]], %[[arg16:.*]] = %[[r58]], %[[arg17:.*]] = %[[r59]], %[[arg18:.*]] = %[[r60]], - //CHECK-SAME: %[[arg19:.*]] = %[[r61]]) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + //CHECK-SAME: %[[arg19:.*]] = %[[r61]]) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, //CHECK-SAME: vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>) { %out:3 = scf.for %k = %c0 to %c1024 step %c64 iter_args(%a_tile = %a_init_tile, %b_tile = %b_init_tile, %c_value = %c_init_value) @@ -152,16 +152,16 @@ gpu.module @test_kernel { //CHECK: %[[r208:.*]] = vector.extract_strided_slice %[[arg19]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: %[[r105:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = 
#xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r105:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r106:.*]] = vector.extract %[[r105]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r107:.*]] = vector.extract %[[r105]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r108:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r108:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r109:.*]] = vector.extract %[[r108]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r110:.*]] = vector.extract %[[r108]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r111:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r111:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r112:.*]] = vector.extract %[[r111]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r113:.*]] = vector.extract %[[r111]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r114:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r114:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r115:.*]] = vector.extract %[[r114]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r116:.*]] = vector.extract %[[r114]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r117:.*]] = vector.extract_strided_slice %[[r106]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -198,16 +198,16 @@ gpu.module @test_kernel { //CHECK: %[[r148:.*]] = vector.extract_strided_slice %[[r116]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> %a_value = xetile.load_tile %a_tile : !xetile.tile<64x64xf16> -> vector<64x64xf16> - //CHECK: %[[r149:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + //CHECK: %[[r149:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> //CHECK: %[[r150:.*]] = vector.extract %[[r149]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r151:.*]] = vector.extract %[[r149]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> - //CHECK: %[[r152:.*]] = 
xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + //CHECK: %[[r152:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> //CHECK: %[[r153:.*]] = vector.extract %[[r152]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r154:.*]] = vector.extract %[[r152]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> - //CHECK: %[[r155:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + //CHECK: %[[r155:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> //CHECK: %[[r156:.*]] = vector.extract %[[r155]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r157:.*]] = vector.extract %[[r155]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> - //CHECK: %[[r158:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + //CHECK: %[[r158:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> //CHECK: %[[r159:.*]] = vector.extract %[[r158]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r160:.*]] = vector.extract %[[r158]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r161:.*]] = vector.extract_strided_slice %[[r150]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xf16> to vector<8x16x2xf16> @@ -384,21 +384,21 @@ gpu.module @test_kernel { //CHECK: %[[r358:.*]] = vector.shuffle %[[r288]], %[[r304]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> //CHECK: %[[r359:.*]] = vector.shuffle %[[r320]], %[[r336]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> //CHECK: %[[r360:.*]] = vector.shuffle %[[r358]], %[[r359]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16x16xf32>, vector<16x16xf32> - //CHECK: %[[r361:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r362:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r363:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r364:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r365:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r366:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r367:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c64]], %[[c0]]] : 
!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r368:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + //CHECK: %[[r361:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r362:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r363:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r364:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r365:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r366:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r367:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r368:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %a_next_tile = xetile.update_tile_offset %a_tile, [%c0, %c64] : !xetile.tile<64x64xf16>, index, index -> !xetile.tile<64x64xf16> %b_next_tile = xetile.update_tile_offset %b_tile, [%c64, %c0] : !xetile.tile<64x64xf16>, index, index -> !xetile.tile<64x64xf16> //CHECK: scf.yield %[[r361]], %[[r362]], %[[r363]], %[[r364]], %[[r365]], %[[r366]], %[[r367]], %[[r368]], %[[r339]], %[[r345]], %[[r351]], %[[r357]], %[[r342]], %[[r348]], %[[r354]], %[[r360]] - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>, //CHECK-SAME: vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32> scf.yield %a_next_tile, %b_next_tile, %c_new_value : !xetile.tile<64x64xf16>, !xetile.tile<64x64xf16>, vector<64x64xf32> @@ -435,38 +435,38 @@ gpu.module @test_kernel { //CHECK: %[[r102:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r103:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r104:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: xegpu.store_nd 
%[[r73]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r77]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r81]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r85]], %[[r10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r74]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r78]], %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r82]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r86]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r75]], %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r79]], %[[r18]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r83]], %[[r19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r87]], %[[r20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r76]], %[[r22]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r80]], %[[r23]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r84]], %[[r24]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r88]], %[[r25]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r89]], %[[r27]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r93]], %[[r28]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = 
#xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r97]], %[[r29]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r101]], %[[r30]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r90]], %[[r32]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r94]], %[[r33]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r98]], %[[r34]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r102]], %[[r35]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r91]], %[[r37]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r95]], %[[r38]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r99]], %[[r39]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r103]], %[[r40]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r92]], %[[r42]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r96]], %[[r43]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r100]], %[[r44]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r104]], %[[r45]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: xegpu.store_nd %[[r73]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r77]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r81]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, 
#xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r85]], %[[r10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r74]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r78]], %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r82]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r86]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r75]], %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r79]], %[[r18]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r83]], %[[r19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r87]], %[[r20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r76]], %[[r22]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r80]], %[[r23]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r84]], %[[r24]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r88]], %[[r25]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r89]], %[[r27]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r93]], %[[r28]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r97]], %[[r29]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r101]], %[[r30]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, 
#xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r90]], %[[r32]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r94]], %[[r33]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r98]], %[[r34]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r102]], %[[r35]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r91]], %[[r37]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r95]], %[[r38]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r99]], %[[r39]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r103]], %[[r40]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r92]], %[[r42]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r96]], %[[r43]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r100]], %[[r44]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r104]], %[[r45]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %out#2, %c_init_tile: vector<64x64xf32>, !xetile.tile<64x64xf32> gpu.return diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir index 4dc97b4cc..2c7400432 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir @@ -24,44 +24,44 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc 
%[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[r10:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.tdesc_attr> -> vector<32x16xi32> - //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.tdesc_attr> -> vector<32x16xi32> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> + //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xi32> -> !xetile.tile<32x32xi32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<32x32xi32> -> vector<32x32xi32> - //CHECK: %20 = xegpu.create_nd_tdesc %[[arg0]][%2, %[[c0]]] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x32xi8, #xegpu.tdesc_attr> + //CHECK: %20 = xegpu.create_nd_tdesc %[[arg0]][%2, %[[c0]]] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xi8> -> !xetile.tile<32x32xi8> - //CHECK: %21 = xegpu.create_nd_tdesc %[[arg1]][%c0, %3] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x16xi8, #xegpu.tdesc_attr> + //CHECK: %21 = xegpu.create_nd_tdesc %[[arg1]][%c0, %3] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xi8> -> !xetile.tile<32x32xi8> %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%a_tile = %a_init_tile, %b_tile = %b_init_tile, %c_value = %c_init_value) -> (!xetile.tile<32x32xi8>, !xetile.tile<32x32xi8>, vector<32x32xi32>) { - //CHECK: %[[r39:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xi8, #xegpu.tdesc_attr> -> vector<32x32xi8> + //CHECK: %[[r39:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> -> vector<32x32xi8> //CHECK: %[[r40:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r41:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [8, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r42:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [16, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r43:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [24, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> %a_value = xetile.load_tile %a_tile : !xetile.tile<32x32xi8> -> vector<32x32xi8> - //CHECK: %[[r44:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xi8, #xegpu.tdesc_attr> -> vector<2x8x16x4xi8> + //CHECK: %[[r44:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> -> vector<2x8x16x4xi8> //CHECK: %[[r45:.*]] = vector.extract %[[r44]][0] : vector<8x16x4xi8> from vector<2x8x16x4xi8> //CHECK: %[[r46:.*]] = vector.extract %[[r44]][1] : vector<8x16x4xi8> from vector<2x8x16x4xi8> %b_value = xetile.load_tile %b_tile : !xetile.tile<32x32xi8> -> vector<32x32xi8> @@ -69,14 +69,14 @@ gpu.module @test_kernel { //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x32xi8>, vector<8x16x4xi8>, vector<8x16xi32> -> vector<8x16xi32> %c_new_value = xetile.tile_mma %a_value, %b_value, %c_value : vector<32x32xi8>, vector<32x32xi8>, vector<32x32xi32> -> vector<32x32xi32> - //CHECK: 
xegpu.update_nd_offset %{{.*}}, [%[[c0]], %[[c32]]] : !xegpu.tensor_desc<32x32xi8, #xegpu.tdesc_attr> - //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c32]], %[[c0]]] : !xegpu.tensor_desc<32x16xi8, #xegpu.tdesc_attr> + //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c0]], %[[c32]]] : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> + //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c32]], %[[c0]]] : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> %a_next_tile = xetile.update_tile_offset %a_tile, [%c0, %c32] : !xetile.tile<32x32xi8>, index, index -> !xetile.tile<32x32xi8> %b_next_tile = xetile.update_tile_offset %b_tile, [%c32, %c0] : !xetile.tile<32x32xi8>, index, index -> !xetile.tile<32x32xi8> scf.yield %a_next_tile, %b_next_tile, %c_new_value : !xetile.tile<32x32xi8>, !xetile.tile<32x32xi8>, vector<32x32xi32> } - //CHECK-COUNT-8: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK-COUNT-8: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> xetile.store_tile %out#2, %c_init_tile {innner_blocks = [8, 16]}: vector<32x32xi32>, !xetile.tile<32x32xi32> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir index 2e3e80358..8108c360c 100755 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir @@ -23,43 +23,43 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r10:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], 
%[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> %2 = xetile.init_tile %arg2[%0, %1] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> - //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> %3 = xetile.load_tile %2 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf32> -> vector<32x32xf32> - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> - //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c16]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c16]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<1024x1024xtf32> -> !xetile.tile<32x32xtf32> - //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc 
%[[arg1]][%[[c0]], %[[r3]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> - //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r5]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> + //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r5]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> %5 = xetile.init_tile %arg1[%c0, %1] : memref<1024x1024xtf32> -> !xetile.tile<32x32xtf32> //CHECK: %[[r24:.*]]:6 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] //CHECK-SAME: iter_args(%[[arg4:.*]] = %[[r20]], %[[arg5:.*]] = %[[r21]], %[[arg6:.*]] = %[[r22]], %[[arg7:.*]] = %[[r23]], %[[arg8:.*]] = %[[r18]], %[[arg9:.*]] = %[[r19]]) - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c64 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xetile.tile<32x32xtf32>, !xetile.tile<32x32xtf32>, vector<32x32xf32>) { //CHECK: %[[r65:.*]] = vector.extract_strided_slice %[[arg8]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r66:.*]] = vector.extract_strided_slice %[[arg8]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -70,10 +70,10 @@ gpu.module @test_kernel { //CHECK: %[[r71:.*]] = vector.extract_strided_slice %[[arg9]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r72:.*]] = vector.extract_strided_slice %[[arg9]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: %[[r33:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> -> vector<2x32x8xtf32> + //CHECK: %[[r33:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> //CHECK: %[[r34:.*]] = vector.extract %[[r33]][0] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r35:.*]] = vector.extract %[[r33]][1] : vector<32x8xtf32> from vector<2x32x8xtf32> - //CHECK: %[[r36:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> -> vector<2x32x8xtf32> + //CHECK: %[[r36:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> //CHECK: %[[r37:.*]] = vector.extract %[[r36]][0] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: 
%[[r38:.*]] = vector.extract %[[r36]][1] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r39:.*]] = vector.extract_strided_slice %[[r34]] {offsets = [0, 0], sizes = [8, 8], strides = [1, 1]} : vector<32x8xtf32> to vector<8x8xtf32> @@ -94,8 +94,8 @@ gpu.module @test_kernel { //CHECK: %[[r54:.*]] = vector.extract_strided_slice %[[r38]] {offsets = [24, 0], sizes = [8, 8], strides = [1, 1]} : vector<32x8xtf32> to vector<8x8xtf32> %7 = xetile.load_tile %arg4 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xtf32> -> vector<32x32xtf32> - //CHECK: %[[r55:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> -> vector<32x16xtf32> - //CHECK: %[[r56:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> -> vector<32x16xtf32> + //CHECK: %[[r55:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> + //CHECK: %[[r56:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> //CHECK: %[[r57:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> //CHECK: %[[r58:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> //CHECK: %[[r59:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> @@ -149,19 +149,19 @@ gpu.module @test_kernel { %9 = xetile.tile_mma %7, %8, %arg6 : vector<32x32xtf32>, vector<32x32xtf32>, vector<32x32xf32> -> vector<32x32xf32> - //CHECK: %[[r111:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> - //CHECK: %[[r112:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> + //CHECK: %[[r111:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r112:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> %10 = xetile.update_tile_offset %arg4, [%c0, %c64] : !xetile.tile<32x32xtf32>, index, index -> !xetile.tile<32x32xtf32> - //CHECK: %[[r113:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> - //CHECK: %[[r114:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> + //CHECK: %[[r113:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r114:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> %11 = xetile.update_tile_offset %arg5, [%c64, %c0] : !xetile.tile<32x32xtf32>, index, index -> !xetile.tile<32x32xtf32> //CHECK: scf.yield %[[r111]], %[[r112]], %[[r113]], %[[r114]], %[[r107]], %[[r110]] - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: 
!xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> scf.yield %10, %11, %9 : !xetile.tile<32x32xtf32>, !xetile.tile<32x32xtf32>, vector<32x32xf32> } @@ -173,14 +173,14 @@ gpu.module @test_kernel { //CHECK: %[[r30:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r31:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r32:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: xegpu.store_nd %[[r25]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r29]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r26]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r30]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r27]], %[[r11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r31]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r28]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r32]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: xegpu.store_nd %[[r25]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r29]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r26]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r30]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r27]], %[[r11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r31]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r28]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r32]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %6#2, %2 : vector<32x32xf32>, !xetile.tile<32x32xf32> //CHECK: gpu.return gpu.return diff --git a/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir index c3f153db1..2fb7cc259 100644 --- a/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir @@ -8,10 +8,10 @@ gpu.module @test_kernel { //CHECK: %[[c64:.*]] = arith.constant 64 : index %c64 = arith.constant 64 : index //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] - //CHECK-SAME: memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> + //CHECK-SAME: memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - //CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16> + //CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %2 = xetile.load_tile %1 : !xetile.tile<32x32xf16> -> vector<32x32xf16> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir b/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir index b5aed2a49..d9640d474 100755 --- a/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir @@ -56,7 +56,7 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp %26 = arith.muli %arg3, %c1024 : index %27 = arith.addi %26, %13 : index %28 = arith.addi %27, %16 : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc {{.*}} : memref<2048x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc {{.*}} : memref<2048x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %29 = xetile.init_tile %arg1[%28, %15] : memref<2048x12288xbf16> -> !xetile.tile<32x32xbf16> %30 = scf.for %arg4 = %c0 to %c2 step %c1 iter_args(%arg5 = %cst) -> (vector<1x4xf32>) { @@ -65,23 +65,23 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp %35 = arith.addi %34, %10 : index %36 = arith.addi %35, %11 : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %37 = xetile.init_tile %arg0[%36, %9] : memref<16384x12288xbf16> -> 
!xetile.tile<32x32xbf16> %38:3 = scf.for %arg6 = %c0 to %c12288 step %c32 iter_args(%arg7 = %37, %arg8 = %29, %arg9 = %cst_0) -> (!xetile.tile<32x32xbf16>, !xetile.tile<32x32xbf16>, vector<32x32xf32>) { - //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x32x16xbf16> + //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK-COUNT-2: %{{.*}} = vector.extract %{{.*}} : vector<32x16xbf16> from vector<2x32x16xbf16> //CHECK-COUNT-8: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %48 = xetile.load_tile %arg7 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xbf16> -> vector<32x32xbf16> - //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xbf16> + //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> //CHECK-COUNT-2: %{{.*}} = vector.extract %{{.*}} : vector<16x16x2xbf16> from vector<2x16x16x2xbf16> //CHECK-COUNT-4: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> %49 = xetile.load_tile %arg8 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xbf16> -> vector<32x32xbf16> - //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> - //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %50 = xetile.update_tile_offset %arg7, [%c0, %c32] : !xetile.tile<32x32xbf16>, index, index -> !xetile.tile<32x32xbf16> %51 = xetile.update_tile_offset %arg8, [%c0, %c32] : !xetile.tile<32x32xbf16>, index, index -> !xetile.tile<32x32xbf16> @@ -101,17 +101,17 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp %41 = vector.shape_cast %40 : vector<32xf32> to vector<1x32xf32> %alloc = memref.alloc() : memref<8x128xf32, 3> - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x16xf32, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x16xf32, #xegpu.block_tdesc_attr> //CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x16xf32, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x16xf32, #xegpu.block_tdesc_attr> %42 = xetile.init_tile %alloc[%17, %13] : memref<8x128xf32, 3> -> !xetile.tile<1x32xf32, #xetile.tile_attr> //CHECK-COUNT-2: vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32> - //CHECK-COUNT-2: xegpu.store_nd 
%{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32, #xegpu.tdesc_attr> + //CHECK-COUNT-2: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %41, %42 : vector<1x32xf32>, !xetile.tile<1x32xf32, #xetile.tile_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.load_nd {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr> -> vector<8x4xf32> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.load_nd {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> //CHECK-COUNT-8: vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32> //CHECK-COUNT-8: arith.addf %{{.*}}, %{{.*}} : vector<1x4xf32> %43 = xetile.init_tile %alloc[%21, %23] : memref<8x128xf32, 3> -> !xetile.tile<8x4xf32, #xetile.tile_attr> @@ -123,8 +123,8 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp } {lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array, step = 1 : index, syn.mm_dim = 0 : i64, syn.parall_level = 2 : i64, upperBoundMap = affine_map<() -> (2)>} //CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<32x2048xf32> -> !xegpu.tensor_desc<1x4xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x4xf32>, !xegpu.tensor_desc<1x4xf32, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<32x2048xf32> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x4xf32>, !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> %31 = arith.addi %26, %16 : index %32 = xetile.init_tile %arg2[%25, %31] : memref<32x2048xf32> -> !xetile.tile<1x4xf32> xetile.store_tile %30, %32 : vector<1x4xf32>, !xetile.tile<1x4xf32> diff --git a/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir index 7f065147d..a452e0454 100644 --- a/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir @@ -12,19 +12,19 @@ gpu.module @test_kernel { %c64 = arith.constant 64 : index %c1024 = arith.constant 1024 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> - //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, 
- //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, vector<32x32xf16>) {
+ //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>) {
 %nexta, %res = scf.for %k= %c0 to %c1024 step %c64 iter_args(%subA = %1, %subB = %cst) -> (!xetile.tile<32x32xf16>, vector<32x32xf16>) {
- //CHECK: %[[r10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %3 = xetile.load_tile %subA : !xetile.tile<32x32xf16> -> vector<32x32xf16>
- //CHECK: %[[r11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %5 = xetile.update_tile_offset %subA, [%c0, %c64]: !xetile.tile<32x32xf16>, index, index -> !xetile.tile<32x32xf16>
- //CHECK: scf.yield %[[r11]], %[[r10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, vector<32x32xf16>
+ //CHECK: scf.yield %[[r11]], %[[r10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>
 scf.yield %5, %3: !xetile.tile<32x32xf16>, vector<32x32xf16>
 }
 //CHECK: %[[r2:.*]] = vector.extract_strided_slice %[[r1]]#1 {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
@@ -33,19 +33,19 @@ gpu.module @test_kernel {
 //CHECK: %[[r5:.*]] = vector.extract_strided_slice %[[r1]]#1 {offsets = [24, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
- //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c8:.*]] = arith.constant 8 : index
- //CHECK: %[[r7:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r7:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c16:.*]] = arith.constant 16 : index
- //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c24:.*]] = arith.constant 24 : index
- //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 %5 = xetile.init_tile %b[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16>
- //CHECK: xegpu.store_nd %[[r2]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[r3]], %[[r7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[r4]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[r5]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: xegpu.store_nd %[[r2]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[r3]], %[[r7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[r4]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[r5]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 xetile.store_tile %res, %5: vector<32x32xf16>, !xetile.tile<32x32xf16>
 //CHECK: gpu.return
diff --git a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir
index 65899b751..58457381c 100644
--- a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir
@@ -4,13 +4,13 @@ gpu.module @test_kernel {
 //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>)
 gpu.func @sglevel_softmax_dim_0(%a: memref<1024x1024xf16>) {
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %1 = xetile.init_tile %a[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16>
- //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
- //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
@@ -31,12 +31,12 @@ gpu.module @test_kernel {
 //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>)
 gpu.func @sglevel_softmax_dim_1(%a: memref<1024x1024xf16>) {
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %1 = xetile.init_tile %a[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16>
- //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
- //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
diff --git a/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir
index b3ea44e70..07d7111f4 100644
--- a/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir
@@ -12,28 +12,28 @@ gpu.module @test_kernel {
 %result = arith.constant dense<0.0>: vector<32x32xf32>
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 //CHECK: %[[c48:.*]] = arith.constant 48 : index
- //CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 //CHECK: %[[c8:.*]] = arith.constant 8 : index
- //CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 //CHECK: %[[c16:.*]] = arith.constant 16 : index
- //CHECK: %[[R8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: %[[R9:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: %[[R9:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 //CHECK: %[[c24:.*]] = arith.constant 24 : index
- //CHECK: %[[R10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: %[[R11:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: %[[R11:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %1 = xetile.init_tile %a[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32>
- //CHECK: xegpu.store_nd %[[R0]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R0]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R1]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R1]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R2]], %[[R8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R2]], %[[R9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R3]], %[[R10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R3]], %[[R11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R0]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R0]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R1]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R1]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R2]], %[[R8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R2]], %[[R9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R3]], %[[R10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R3]], %[[R11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 xetile.store_tile %result, %1: vector<32x32xf32>, !xetile.tile<32x32xf32>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir
index d92ee037e..c99ec99bd 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir
@@ -9,10 +9,10 @@ gpu.module @test_kernel {
 //CHECK: %[[c64:.*]] = arith.constant 64 : index
 %c64 = arith.constant 64 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16>
- //CHECK: %[[r1:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16>
+ //CHECK: %[[r1:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16>
 //CHECK: %[[r2:.*]] = vector.extract %[[r1]][0] : vector<32x16xf16> from vector<2x32x16xf16>
 //CHECK: %[[r3:.*]] = vector.extract %[[r1]][1] : vector<32x16xf16> from vector<2x32x16xf16>
 //CHECK: %[[r4:.*]] = vector.extract_strided_slice %[[r2]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16>
@@ -24,15 +24,15 @@ gpu.module @test_kernel {
 //CHECK: %[[r10:.*]] = vector.extract_strided_slice %[[r3]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16>
 //CHECK: %[[r11:.*]] = vector.extract_strided_slice %[[r3]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16>
 %2 = xetile.load_tile %1 : !xetile.tile<32x32xf16> -> vector<32x32xf16>
- //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c64]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c64]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %arg1[%[[c64]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %arg1[%[[c64]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 %3 = xetile.init_tile %b[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16>
- //CHECK: %[[r14:.*]] = xegpu.load_nd %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16>
+ //CHECK: %[[r14:.*]] = xegpu.load_nd %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16>
 //CHECK: %[[r15:.*]] = vector.extract %[[r14]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 //CHECK: %[[r16:.*]] = vector.extract %[[r14]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
- //CHECK: %[[r17:.*]] = xegpu.load_nd %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16>
+ //CHECK: %[[r17:.*]] = xegpu.load_nd %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16>
 //CHECK: %[[r18:.*]] = vector.extract %[[r17]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 //CHECK: %[[r19:.*]] = vector.extract %[[r17]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 //CHECK: %[[r20:.*]] = vector.extract_strided_slice %[[r15]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xf16> to vector<8x16x2xf16>
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir
index 5f336d335..ea32986ef 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir
@@ -10,7 +10,7 @@ gpu.module @test_kernel {
 %3 = xetile.tile_unpack %2 { inner_blocks = [1, 16] } : vector<32x4x1x16xf16> -> vector<32x64xf16>
 %4 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
 %5 = xetile.tile_pack %3 { inner_blocks = [1, 16] } : vector<32x64xf16> -> vector<32x4x1x16xf16>
- // CHECK-COUNT-128: xegpu.store_nd %[[cst]], %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.tdesc_attr>
+ // CHECK-COUNT-128: xegpu.store_nd %[[cst]], %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>
 xetile.store_tile %5, %4 : vector<32x4x1x16xf16>, !xetile.tile<32x64xf16, #xetile.tile_attr>
 gpu.return
 }
@@ -87,7 +87,7 @@ gpu.module @test_kernel {
 %2 = xetile.tile_unpack %1 { inner_blocks = [1, 16] } : vector<32x4x1x16xf16> -> vector<32x64xf16>
 %3 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
 %4 = xetile.tile_pack %2 { inner_blocks = [1, 16] } : vector<32x64xf16> -> vector<32x4x1x16xf16>
- // CHECK-COUNT-128: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.tdesc_attr>
+ // CHECK-COUNT-128: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>
 xetile.store_tile %4, %3 : vector<32x4x1x16xf16>, !xetile.tile<32x64xf16, #xetile.tile_attr>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir
index 04b2db65f..c7a56882c 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir
@@ -9,11 +9,11 @@ gpu.module @test_kernel {
 %c64 = arith.constant 64 : index
 // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[%[[C0]], %[[C64]]] : memref<1024x1024xf16>
- // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
 // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %1 = xetile.load_tile %0 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x1x32x32xf16>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
index b9b7b49c4..0f15fa7ff 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
@@ -14,30 +14,30 @@
 // CHECK: %[[c1024:.*]] = arith.constant 1024 : index
 %c1024 = arith.constant 1024 : index
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
 // CHECK: %[[R1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]]
- // CHECK-SAME: iter_args(%[[arg3:.*]] = %[[R0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, vector<32x32xf16>)
+ // CHECK-SAME: iter_args(%[[arg3:.*]] = %[[R0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>)
 %1:2 = scf.for %arg2 = %c0 to %c1024 step %c64 iter_args(%arg3 = %0, %arg4 = %cst) -> (!xetile.tile<32x32xf16, #xetile.tile_attr>, vector<1x1x32x32xf16>) {
- // CHECK: %[[R10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ // CHECK: %[[R10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %5 = xetile.load_tile %arg3 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x1x32x32xf16>
- // CHECK: %[[R11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %6 = xetile.update_tile_offset %arg3, [%c0, %c64] : !xetile.tile<32x32xf16, #xetile.tile_attr>, index, index -> !xetile.tile<32x32xf16, #xetile.tile_attr>
- // CHECK: scf.yield %[[R11]], %[[R10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, vector<32x32xf16>
+ // CHECK: scf.yield %[[R11]], %[[R10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>
 scf.yield %6, %5 : !xetile.tile<32x32xf16, #xetile.tile_attr>, vector<1x1x32x32xf16>
 }
- // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 // CHECK: %[[c8:.*]] = arith.constant 8 : index
- // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 // CHECK: %[[c16:.*]] = arith.constant 16 : index
- // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 // CHECK: %[[c24:.*]] = arith.constant 24 : index
- // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 %2 = xetile.init_tile %arg1[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
 // CHECK: %[[R6:.*]] = vector.extract_strided_slice %[[R1]]#1 {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
@@ -47,10 +47,10 @@
 %3 = xetile.tile_unpack %1#1 { inner_blocks = [32, 32] } : vector<1x1x32x32xf16> -> vector<32x32xf16>
 %4 = xetile.tile_pack %3 { inner_blocks = [8, 32] } : vector<32x32xf16> -> vector<4x1x8x32xf16>
- // CHECK: xegpu.store_nd %[[R6]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- // CHECK: xegpu.store_nd %[[R7]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- // CHECK: xegpu.store_nd %[[R8]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- // CHECK: xegpu.store_nd %[[R9]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: xegpu.store_nd %[[R6]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: xegpu.store_nd %[[R7]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: xegpu.store_nd %[[R8]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: xegpu.store_nd %[[R9]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 xetile.store_tile %4, %2 : vector<4x1x8x32xf16>, !xetile.tile<32x32xf16, #xetile.tile_attr>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir
index da41639bd..cd736f738 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir
@@ -4,13 +4,13 @@ gpu.module @test_kernel {
 //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>)
 gpu.func @sglevel_softmax_dim_0(%arg0: memref<1024x1024xf16>) {
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
- //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
- //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %1 = xetile.load_tile %0 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x2x32x32xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
@@ -35,12 +35,12 @@ gpu.module @test_kernel {
 //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>)
 gpu.func @sglevel_softmax_dim_1(%arg0: memref<1024x1024xf16>) {
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
- //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
- //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %1 = xetile.load_tile %0 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x2x32x32xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir
index 1ca2a5549..c6829fbfe 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir
@@ -8,25 +8,25 @@
 // CHECK: %[[c0:.*]] = arith.constant 0 : index
 // CHECK: %[[c32:.*]] = arith.constant 32 : index
 // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[c48:.*]] = arith.constant 48 : index
 // CHECK: %[[R1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[c8:.*]] = arith.constant 8 : index
 // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[c16:.*]] = arith.constant 16 : index
 // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[c24:.*]] = arith.constant 24 : index
 // CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32, #xetile.tile_attr>
 // CHECK: %[[R8:.*]] = vector.extract_strided_slice %[[cst]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32>
@@ -37,21 +37,21 @@
 %2 = xetile.tile_pack %1 { inner_blocks = [8, 16] } : vector<32x32xf32> -> vector<4x2x8x16xf32>
 // CHECK: xegpu.store_nd %[[R8]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R8]], %[[R1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R9]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R9]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R10]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R10]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R11]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R11]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 xetile.store_tile %2, %0 : vector<4x2x8x16xf32>, !xetile.tile<32x32xf32, #xetile.tile_attr>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir
index de76a2a62..a899e8773 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir
@@ -8,24 +8,24 @@ gpu.module @test_kernel {
 // CHECK: %[[C64:.*]] = arith.constant 64 : index
 %c64 = arith.constant 64 : index
- // CHECK: %[[REG0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[C0]], %[[C64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[REG0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[C0]], %[[C64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
- // CHECK: %[[REG1:.*]] = xegpu.load_nd %[[REG0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16>
+ // CHECK: %[[REG1:.*]] = xegpu.load_nd %[[REG0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16>
 // CHECK: %[[REG2:.*]] = vector.extract %[[REG1]][0] : vector<32x16xf16> from vector<2x32x16xf16>
 // CHECK: %[[REG3:.*]] = vector.extract %[[REG1]][1] : vector<32x16xf16> from vector<2x32x16xf16>
 %1 = xetile.load_tile %0 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x2x32x16xf16>
- // CHECK: %[[REG4:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[REG4:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 // CHECK: %[[C32:.*]] = arith.constant 32 : index
- // CHECK: %[[REG5:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[REG5:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 %2 = xetile.init_tile %arg1[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
- // CHECK: %[[REG6:.*]] = xegpu.load_nd %[[REG4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16>
+ // CHECK: %[[REG6:.*]] = xegpu.load_nd %[[REG4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16>
 // CHECK: %[[REG7:.*]] = vector.extract %[[REG6]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 // CHECK: %[[REG8:.*]] = vector.extract %[[REG6]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
- // CHECK: %[[REG9:.*]] = xegpu.load_nd %[[REG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16>
+ // CHECK: %[[REG9:.*]] = xegpu.load_nd %[[REG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16>
 // CHECK: %[[REG10:.*]] = vector.extract %[[REG9]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 // CHECK: %[[REG11:.*]] = vector.extract %[[REG9]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 %3 = xetile.load_tile %2 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x4x32x16xf16>
diff --git a/test/Conversion/XeTileToXeGPU/test_order.mlir b/test/Conversion/XeTileToXeGPU/test_order.mlir
index e5befc0a7..68a1b43ab 100644
--- a/test/Conversion/XeTileToXeGPU/test_order.mlir
+++ b/test/Conversion/XeTileToXeGPU/test_order.mlir
@@ -5,14 +5,14 @@
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[C16:.*]] = arith.constant 16 : index
 // CHECK: %[[R_CAST:.*]] = memref.reinterpret_cast %[[ARG1]] to offset: [0], sizes: [128, 64], strides: [64, 1] : memref<64x128xf16, strided<[1, 64]>> to memref<128x64xf16>
-// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C0]], %[[C0]]] : memref<128x64xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr>
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C16]], %[[C0]]] : memref<128x64xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr>
-// CHECK: %[[T8:.*]] = xegpu.load_nd %[[T1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<8x16x2xf16>
-// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<8x16x2xf16>
-// CHECK: %[[T19:.*]] = xegpu.update_nd_offset %[[T1]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr>
-// CHECK: %[[T20:.*]] = xegpu.update_nd_offset %[[T2]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr>
-// CHECK: %[[T26:.*]] = xegpu.load_nd %[[T19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<8x16x2xf16>
-// CHECK: %[[T27:.*]] = xegpu.load_nd %[[T20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<8x16x2xf16>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C0]], %[[C0]]] : memref<128x64xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C16]], %[[C0]]] : memref<128x64xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>
+// CHECK: %[[T8:.*]] = xegpu.load_nd %[[T1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16x2xf16>
+// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16x2xf16>
+// CHECK: %[[T19:.*]] = xegpu.update_nd_offset %[[T1]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>
+// CHECK: %[[T20:.*]] = xegpu.update_nd_offset %[[T2]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>
+// CHECK: %[[T26:.*]] = xegpu.load_nd %[[T19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16x2xf16>
+// CHECK: %[[T27:.*]] = xegpu.load_nd %[[T20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16x2xf16>
 gpu.module @test_kernel {
 func.func @test_func(%A : memref<128x64xf16>, %B : memref<64x128xf16, strided<[1, 64], offset: 0>>) {
 %c0 = arith.constant 0 : index
diff --git a/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/test/Dialect/XeGPU/IR/XeGPUOps.mlir
index 19954e667..09ea28a35 100644
--- a/test/Dialect/XeGPU/IR/XeGPUOps.mlir
+++ b/test/Dialect/XeGPU/IR/XeGPUOps.mlir
@@ -24,11 +24,10 @@ func.func @test_create_nd_tdesc_vc(%src: memref<24x32xf32>) {
 // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
 func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
- // CHECK: xegpu.create_tdesc
- // CHECK-SAME: {chunk_size = 2 : i64}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
+ // CHECK: xegpu.create_tdesc %{{.*}} : ui64, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
+ -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir b/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
index d763e002b..c4bf4aea0 100644
--- a/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
+++ b/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
@@ -6,33 +6,33 @@
 // CHECK-LABEL: func @test_atomic_rmw({{.*}}) {
 func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : vector<16xf32>, %mask : vector<16xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16xf32>
- xegpu.atomic_rmw addf %1, %mask, %value : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32>
+ xegpu.atomic_rmw addf %1, %mask, %value : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
 return
 }
 // CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) {
 func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16x2xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
 // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>, vector<16x2xi1>, vector<16x2xf32>
- xegpu.atomic_rmw mulf %1, %mask, %value : !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>, vector<16x2xi1>, vector<16x2xf32> -> vector<16x2xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16x2xi1>, vector<16x2xf32>
+ xegpu.atomic_rmw mulf %1, %mask, %value : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16x2xi1>, vector<16x2xf32> -> vector<16x2xf32>
 return
 }
 // CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) {
 func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16x2xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>
 // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>, vector<16x2xi1>, vector<16x2xi32>
- xegpu.atomic_rmw andi %1, %mask, %value : !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>, vector<16x2xi1>, vector<16x2xi32> -> vector<16x2xi32>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16x2xi1>, vector<16x2xi32>
+ xegpu.atomic_rmw andi %1, %mask, %value : !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16x2xi1>, vector<16x2xi32> -> vector<16x2xi32>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
index 23e92b46e..e437622a6 100644
--- a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
+++ b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
@@ -65,9 +65,9 @@ func.func @test_create_nd_tdesc_4(%src: memref, %w : index, %h : index,
 func.func @test_create_nd_tdesc_5(%src: memref, %w : index, %h : index, %x : index, %y : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>>
 %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 return
 }
@@ -75,9 +75,9 @@ func.func @test_create_nd_tdesc_5(%src: memref, %w : index, %h : index,
 func.func @test_create_nd_tdesc_6(%src: memref, %w : index, %h : index, %x : index, %y : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>>
 %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 return
 }
@@ -94,9 +94,9 @@ func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) {
 func.func @test_create_nd_tdesc_8(%src: memref, %w : index, %h : index, %x : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>>
 %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 return
 }
@@ -104,8 +104,8 @@ func.func @test_create_nd_tdesc_8(%src: memref, %w : index, %h : index,
 func.func @test_create_nd_tdesc_9(%src: memref, %w : index, %h : index, %x : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<64x128xf16, #xegpu.tdesc_attr>>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr>>
 %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] : memref
- -> !xegpu.tensor_desc<64x128xf16, #xegpu.tdesc_attr>
+ -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
index 30a2ee66a..e24c15574 100644
--- a/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
+++ b/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
@@ -69,9 +69,9 @@ func.func @test_create_nd_tdesc_vc_5(%src: memref, %w : index, %h : ind
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
 // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 return
 }
@@ -80,9 +80,9 @@ func.func @test_create_nd_tdesc_vc_6(%src: memref, %w : index, %h : ind
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
 // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 return
 }
@@ -100,16 +100,16 @@ func.func @test_create_nd_tdesc_vc_7(%src: memref<1024xf32>, %offset : index) {
 func.func @test_create_nd_tdesc_vc_8(%src: memref, %w : index, %h : index, %x : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 return
 }
 // CHECK-LABEL: func @test_create_nd_tdesc_vc_9({{.*}}) {
 func.func @test_create_nd_tdesc_vc_9(%src: memref<8x32xf32>) {
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
index 245a37ba9..6f5b1a743 100644
--- a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
+++ b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
@@ -8,37 +8,35 @@
 // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
 func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
 // CHECK: xegpu.create_tdesc %arg0, %arg1
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 return
 }
 // CHECK-LABEL: func @test_create_tdesc_vc_2({{.*}}) {
 func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) {
 // CHECK: xegpu.create_tdesc %arg0, %arg1
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
 %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
- -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
 return
 }
 // CHECK-LABEL: func @test_create_tdesc_vc_3({{.*}}) {
 func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) {
 // CHECK: xegpu.create_tdesc %arg0, %arg1
- // CHECK-SAME: {chunk_size = 8 : i64}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 8}
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
+ -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
 return
 }
 // CHECK-LABEL: func @test_create_tdesc_vc_4({{.*}}) {
 func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) {
- // CHECK: xegpu.create_tdesc %arg0, %arg1
- // CHECK-SAME: {chunk_size = 2 : i64}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
+ // CHECK: xegpu.create_tdesc %arg0, %arg1 : ui64, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
+ -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
 return
 }
@@ -46,9 +44,9 @@ func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) {
 // CHECK-LABEL: func @test_create_tdesc_vc_5({{.*}}) {
 func.func @test_create_tdesc_vc_5(%src: memref, %offsets : vector<16 x index>) {
 // CHECK: xegpu.create_tdesc
- // CHECK-SAME: {chunk_size = 2 : i64}
- // CHECK-SAME: memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}
- : memref, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : memref, vector<16 x index>
+ -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/invalid_vc.mlir b/test/Dialect/XeGPU/IR/invalid_vc.mlir
index db7526320..90b8887d2 100644
--- a/test/Dialect/XeGPU/IR/invalid_vc.mlir
+++ b/test/Dialect/XeGPU/IR/invalid_vc.mlir
@@ -50,21 +50,20 @@ func.func @test_create_nd_tdesc_vc_5(%input: memref<24x32x64xf32>) {
 func.func @test_create_tdesc(%src: ui64, %offsets : vector<16x8xindex>) {
 // expected-error@+1 {{operand #1 must be vector of index values of ranks 1}}
 %1 = xegpu.create_tdesc %src, %offsets
- : ui64, vector<16x8xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
+ : ui64, vector<16x8xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<>>
 return
 }
 // -----
 func.func @test_load_gather(%src: ui64, %offsets : vector<16xindex>) {
 %0 = arith.constant dense<1>: vector<16x8xi1>
- // CHECK: xegpu.create_tdesc
- // CHECK-SAME: {chunk_size = 8}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 8}
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr>
+ // CHECK: xegpu.create_tdesc {{.*}} : ui64, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex>
+ -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr>
 // expected-error@+1 {{failed to verify that all of {value, TensorDesc} have same rank}}
 %2 = xegpu.load %1, %0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr>, vector<16x8xi1> -> vector<8x8x4xf16>
+ : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr>, vector<16x8xi1> -> vector<8x8x4xf16>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/load_gather_vc.mlir b/test/Dialect/XeGPU/IR/load_gather_vc.mlir
index a5722ad60..68b202c38 100644
--- a/test/Dialect/XeGPU/IR/load_gather_vc.mlir
+++ b/test/Dialect/XeGPU/IR/load_gather_vc.mlir
@@ -9,13 +9,13 @@ func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) {
 %0 = arith.constant dense<1>: vector<16xi1>
 //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex>
- //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
- // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 return
 }
@@ -23,15 +23,15 @@ func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) {
 func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) {
 %0 = arith.constant dense<1>: vector<16x8xi1>
- //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} {chunk_size = 8 : i64}
- //CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 8}
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
+ //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64, vector<16xindex>
+ //CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex>
+ -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
 //CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}>
- //CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>, vector<16x8xi1> -> vector<8x16xf32>
+ //CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16x8xi1> -> vector<8x16xf32>
 %2 = xegpu.load %1, %0 {transpose = array, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>, vector<16x8xi1> -> vector<8x16xf32>
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16x8xi1> -> vector<8x16xf32>
 return
 }
@@ -39,13 +39,13 @@ func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) {
 func.func @test_load_gather_vc_4(%src: ui64, %offsets : vector<16xindex>) {
 %0 = arith.constant dense<1>: vector<16xi1>
- //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 1}
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex>
+ -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 //CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
- //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/load_nd_vc.mlir b/test/Dialect/XeGPU/IR/load_nd_vc.mlir
index 202c93d60..2d4b08d1d 100644
--- a/test/Dialect/XeGPU/IR/load_nd_vc.mlir
+++ b/test/Dialect/XeGPU/IR/load_nd_vc.mlir
@@ -56,15 +56,15 @@ func.func @test_load_nd_simd_bf16(%src: ui64, %w : index, %h : index, %x : index
 // CHECK-LABEL: func @test_load_nd_block_array_simd_f16({{.*}}) {
 func.func @test_load_nd_block_array_simd_f16(%src: memref<8x32xf16>) {
 // CHECK: xegpu.create_nd_tdesc %{{.*}}[0, 0]
- // CHECK-SAME: memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 %1 = xegpu.create_nd_tdesc %src[0, 0]
- : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.load_nd
 // CHECK-SAME: <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> :
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16xf16>
 %2 = xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16>
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16xf16>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/store_scatter.mlir b/test/Dialect/XeGPU/IR/store_scatter.mlir
index 7e454867a..e41f46773 100644
--- a/test/Dialect/XeGPU/IR/store_scatter.mlir
+++ b/test/Dialect/XeGPU/IR/store_scatter.mlir
@@ -9,25 +9,25 @@ func.func @test_store_scatter(%src: ui64, %offsets : vector<16xindex>, %dst: ui6
 %0 = arith.constant dense: vector<16xi1>
 // CHECK: xegpu.create_tdesc
 // CHECK-SAME:
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 %1 = xegpu.create_tdesc %src, %offsets
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK: xegpu.create_tdesc
 // CHECK-SAME:
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 %2 = xegpu.create_tdesc %dst, %offsets
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK: xegpu.load
 // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 %3 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 // CHECK: xegpu.store
 // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>
+ // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
 xegpu.store %3, %2, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
index 493e54a6a..df304e739 100644
--- a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
+++ b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
@@ -8,24 +8,23 @@
 func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) {
 %0 = arith.constant dense<1>: vector<16xi1>
 // CHECK: xegpu.create_tdesc
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 //
CHECK: xegpu.create_tdesc - // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %2 = xegpu.create_tdesc %dst, %offsets - : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.load // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %3 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> // CHECK: xegpu.store // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> xegpu.store %3, %2, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> return } diff --git a/test/Dialect/XeGPU/IR/update_offset_vc.mlir b/test/Dialect/XeGPU/IR/update_offset_vc.mlir index 78f25e71e..164319a83 100644 --- a/test/Dialect/XeGPU/IR/update_offset_vc.mlir +++ b/test/Dialect/XeGPU/IR/update_offset_vc.mlir @@ -8,22 +8,22 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %0 = arith.constant dense<1>: vector<16xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %1 = xegpu.create_tdesc %src, %offsets - : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.load // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %3 = arith.constant dense<16>: vector<16 x index> // CHECK: xegpu.update_offset - // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %5 = xegpu.update_offset %1, %3 - : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> return } diff --git 
a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir index ae7954564..6d1c6f55e 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir @@ -150,12 +150,12 @@ module @gemm attributes {gpu.container_module} { // two 32x16 A tiles from 256x32 WG slice - %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %A_sg_init_tile_1 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c16] : memref<4096x4096xf16> //create B tiles - %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr< array_length = 2>> + %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %B_sg_init_tile_2 = xegpu.update_nd_offset %B_sg_init_tile_1, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> // %B_sg_init_tile_3 = xegpu.update_nd_offset %B_sg_init_tile_2, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> @@ -213,9 +213,9 @@ module @gemm attributes {gpu.container_module} { %A_prefetch_tile = %A_sg_prefetch_tile_iter3, %B_prefetch_tile = %B_sg_prefetch_tile_iter3 ) -> - (!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> ) @@ -228,13 +228,13 @@ module @gemm attributes {gpu.container_module} { xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier } // load A tiles - %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %a_val_0 = vector.extract %a_val [0] : vector<32x16xf16> from vector<2x32x16xf16> %a_val_1 = vector.extract %a_val [1] : vector<32x16xf16> from vector<2x32x16xf16> // load B tiles - %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> - %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : 
!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> %b_val_0 = vector.extract %b_val_arr_0 [0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> %b_val_1 = vector.extract %b_val_arr_0 [1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> @@ -254,11 +254,11 @@ module @gemm attributes {gpu.container_module} { %next_A_prefetch_tile = xegpu.update_nd_offset %A_prefetch_tile, [%c0, %c32] : !xegpu.tensor_desc<8x32xf16> %next_B_prefetch_tile = xegpu.update_nd_offset %B_prefetch_tile, [%c32, %c0] : !xegpu.tensor_desc<8x32xf16> // advance A and B tiles - %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_A_tile_1 = xegpu.update_nd_offset %A_tile_1, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16> - %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_B_tile_2 = xegpu.update_nd_offset %B_tile_2, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> // %next_B_tile_3 = xegpu.update_nd_offset %B_tile_3, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> @@ -368,9 +368,9 @@ module @gemm attributes {gpu.container_module} { scf.yield %next_A_tile_0, %next_B_tile_0, %next_B_tile_1, %new_c_val_0_0, %new_c_val_0_1, %new_c_val_0_2, %new_c_val_0_3, %new_c_val_1_0, %new_c_val_1_1, %new_c_val_1_2, %new_c_val_1_3, %new_c_val_2_0, %new_c_val_2_1, %new_c_val_2_2, %new_c_val_2_3, %new_c_val_3_0, %new_c_val_3_1, %new_c_val_3_2, %new_c_val_3_3, %next_A_prefetch_tile, %next_B_prefetch_tile - : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> } diff --git a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir index a65503775..0850e3f83 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir @@ -150,12 +150,12 @@ module @gemm attributes 
{gpu.container_module} { // two 32x16 A tiles from 256x32 WG slice - %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %A_sg_init_tile_1 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c16] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16> //create B tiles - %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr< array_length = 2>> + %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %B_sg_init_tile_2 = xegpu.update_nd_offset %B_sg_init_tile_1, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> // %B_sg_init_tile_3 = xegpu.update_nd_offset %B_sg_init_tile_2, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> @@ -213,9 +213,9 @@ module @gemm attributes {gpu.container_module} { %A_prefetch_tile = %A_sg_prefetch_tile_iter3, %B_prefetch_tile = %B_sg_prefetch_tile_iter3 ) -> - (!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> ) @@ -228,13 +228,13 @@ module @gemm attributes {gpu.container_module} { xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier } // load A tiles - %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %a_val_0 = vector.extract %a_val [0] : vector<32x16xf16> from vector<2x32x16xf16> %a_val_1 = vector.extract %a_val [1] : vector<32x16xf16> from vector<2x32x16xf16> // load B tiles - %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> - %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_1 = xegpu.load_nd 
%B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> %b_val_0 = vector.extract %b_val_arr_0 [0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> %b_val_1 = vector.extract %b_val_arr_0 [1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> @@ -254,11 +254,11 @@ module @gemm attributes {gpu.container_module} { %next_A_prefetch_tile = xegpu.update_nd_offset %A_prefetch_tile, [%c0, %c32] : !xegpu.tensor_desc<8x32xf16> %next_B_prefetch_tile = xegpu.update_nd_offset %B_prefetch_tile, [%c32, %c0] : !xegpu.tensor_desc<8x32xf16> // advance A and B tiles - %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_A_tile_1 = xegpu.update_nd_offset %A_tile_1, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16> - %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_B_tile_2 = xegpu.update_nd_offset %B_tile_2, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> // %next_B_tile_3 = xegpu.update_nd_offset %B_tile_3, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> @@ -368,9 +368,9 @@ module @gemm attributes {gpu.container_module} { scf.yield %next_A_tile_0, %next_B_tile_0, %next_B_tile_1, %new_c_val_0_0, %new_c_val_0_1, %new_c_val_0_2, %new_c_val_0_3, %new_c_val_1_0, %new_c_val_1_1, %new_c_val_1_2, %new_c_val_1_3, %new_c_val_2_0, %new_c_val_2_1, %new_c_val_2_2, %new_c_val_2_3, %new_c_val_3_0, %new_c_val_3_1, %new_c_val_3_2, %new_c_val_3_3, %next_A_prefetch_tile, %next_B_prefetch_tile - : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> } diff --git a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir index b65ed81c3..9de6ec890 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir @@ -146,12 +146,12 @@ module @gemm attributes {gpu.container_module} { // two 32x16 A tiles from 256x32 WG slice - %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> 
!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %A_sg_init_tile_1 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c16] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16> //create B tiles - %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr< array_length = 2>> + %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr< array_length = 2>> // %B_sg_init_tile_2 = xegpu.update_nd_offset %B_sg_init_tile_1, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> // %B_sg_init_tile_3 = xegpu.update_nd_offset %B_sg_init_tile_2, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> @@ -209,9 +209,9 @@ module @gemm attributes {gpu.container_module} { %A_prefetch_tile = %A_sg_prefetch_tile_iter3, %B_prefetch_tile = %B_sg_prefetch_tile_iter3 ) -> - (!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> ) @@ -224,13 +224,13 @@ module @gemm attributes {gpu.container_module} { xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier } // load A tiles - %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %a_val_0 = vector.extract %a_val [0] : vector<32x16xf16> from vector<2x32x16xf16> %a_val_1 = vector.extract %a_val [1] : vector<32x16xf16> from vector<2x32x16xf16> // load B tiles - %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> - %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> %b_val_0 = vector.extract %b_val_arr_0 [0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> %b_val_1 = 
vector.extract %b_val_arr_0 [1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> @@ -250,11 +250,11 @@ module @gemm attributes {gpu.container_module} { %next_A_prefetch_tile = xegpu.update_nd_offset %A_prefetch_tile, [%c0, %c32] : !xegpu.tensor_desc<8x32xf16> %next_B_prefetch_tile = xegpu.update_nd_offset %B_prefetch_tile, [%c32, %c0] : !xegpu.tensor_desc<8x32xf16> // advance A and B tiles - %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_A_tile_1 = xegpu.update_nd_offset %A_tile_1, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16> - %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_B_tile_2 = xegpu.update_nd_offset %B_tile_2, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> // %next_B_tile_3 = xegpu.update_nd_offset %B_tile_3, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> @@ -364,9 +364,9 @@ module @gemm attributes {gpu.container_module} { scf.yield %next_A_tile_0, %next_B_tile_0, %next_B_tile_1, %new_c_val_0_0, %new_c_val_0_1, %new_c_val_0_2, %new_c_val_0_3, %new_c_val_1_0, %new_c_val_1_1, %new_c_val_1_2, %new_c_val_1_3, %new_c_val_2_0, %new_c_val_2_1, %new_c_val_2_2, %new_c_val_2_3, %new_c_val_3_0, %new_c_val_3_1, %new_c_val_3_2, %new_c_val_3_3, %next_A_prefetch_tile, %next_B_prefetch_tile - : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> } diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir index ad318ecd3..832cf06a2 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir @@ -19,8 +19,8 @@ module @gemm attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<16x32xf16>, %arg1: memref<16x32xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<16x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> - %1 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<16x32xf16> -> 
!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> %3 = arith.extf %1: vector<2x16x16xf16> to vector<2x16x16xf32> %4 = vector.extract %3[0]: vector<16x16xf32> from vector<2x16x16xf32> %5 = vector.extract %3[1]: vector<16x16xf32> from vector<2x16x16xf32> diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir index b672f457c..cd3f7ffc4 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir @@ -18,9 +18,9 @@ module @gemm attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x32xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x32xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %3 = arith.extf %1: vector<2x32x16xf16> to vector<2x32x16xf32> %4 = vector.extract %3[0]: vector<32x16xf32> from vector<2x32x16xf32> %5 = vector.extract %3[1]: vector<32x16xf32> from vector<2x32x16xf32> diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir index 72a06b2e6..1c3e463c8 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir @@ -19,10 +19,10 @@ module @gemm attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<8x32xf16>, %arg1: memref<8x32xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> + %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %arg1[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32> %2 = xegpu.create_nd_tdesc %arg1[0, 16] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32> - %3 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16> + %3 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16xf16> %4 = vector.extract %3[0]: vector<8x16xf16> from vector<2x8x16xf16> %5 = vector.extract %3[1]: vector<8x16xf16> from vector<2x8x16xf16> %8 = arith.extf %4: vector<8x16xf16> to vector<8x16xf32> diff --git a/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir 
index 084a58c3f..1fd95155f 100644 --- a/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir @@ -44,21 +44,21 @@ module @gemm attributes {gpu.container_module} { // Spirv has no lowering for memref.reinterpret_cast with different sizes (doesn't work: memref<3x16xf32> to memref<16xf32>) // Each row has a tdesc with offsets that determine linearized memref's values to be loaded %offsets_row1 = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> - %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %row_1_store = arith.select %row_mask, %row_1_loaded, %user_val : vector<16xi1>, vector<16xf32> - xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> - %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %row_2_store = arith.select %row_mask, %row_2_loaded, %user_val : vector<16xi1>, vector<16xf32> - xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> // The entire row is out of bounds - %row_3_out_td = xegpu.update_offset %row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %row_3_out_td = xegpu.update_offset 
%row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather2d_masked_slm_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather2d_masked_slm_f32.mlir index 2f4f6c46f..4713f5cae 100644 --- a/test/Integration/Dialect/XeGPU/loadgather2d_masked_slm_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather2d_masked_slm_f32.mlir @@ -44,21 +44,21 @@ module @gemm attributes {gpu.container_module} { // Spirv has no lowering for memref.reinterpret_cast with different sizes (doesn't work: memref<3x16xf32> to memref<16xf32>) // Each row has a tdesc with offsets that determine linearized memref's values to be loaded %offsets_row1 = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> - %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32> %row_1_store = arith.select %row_mask, %row_1_loaded, %user_val : vector<16xi1>, vector<16xf32> - xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> - %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32> %row_2_store = arith.select %row_mask, %row_2_loaded, %user_val : vector<16xi1>, vector<16xf32> - xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, 
!xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> // The entire row is out of bounds - %row_3_out_td = xegpu.update_offset %row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %row_3_out_td = xegpu.update_offset %row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir index 575fc4c36..21c54e674 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir @@ -34,10 +34,10 @@ module @gemm attributes {gpu.container_module} { // Valid offsets (%offsets for which %mask is 1) should not exceed 16*2=32. %offsets = arith.constant dense<[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]> : vector<16xi1> - %tdesc_in = xegpu.create_tdesc %in, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> - %tdesc_out = xegpu.create_tdesc %out, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16x2xf32> - xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %tdesc_out = xegpu.create_tdesc %out, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xf32> + xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir index 2175f0e4b..c99ec4224 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir @@ -34,10 +34,10 @@ module @gemm attributes {gpu.container_module} { // Valid offsets (%offsets for which %mask is 1) should not exceed 16*2=32. 
%offsets = arith.constant dense<[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]> : vector<16xi1> - %tdesc_in = xegpu.create_tdesc %in, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr> - %tdesc_out = xegpu.create_tdesc %out, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16x2xi32> - xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xi32>, !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> + %tdesc_out = xegpu.create_tdesc %out, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> + %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xi32> + xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xi32>, !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_f32.mlir index 452d7d7e7..88e2cbf8c 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_f32.mlir @@ -28,10 +28,10 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> - xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> + xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir index e7dc1dd74..349576169 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir @@ -28,10 +28,10 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<[1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], 
sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> - xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> + xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_masked_slm_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_masked_slm_f32.mlir index bf28c6afc..dc0686165 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_masked_slm_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_masked_slm_f32.mlir @@ -28,10 +28,10 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<[1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> - xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32> + xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_slm_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_slm_f32.mlir index 4d43d67dc..6fdcdf149 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_slm_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_slm_f32.mlir @@ -29,10 +29,10 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, 
vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> - xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32> + xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir index 5803b748c..60e7b5708 100644 --- a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir @@ -22,8 +22,8 @@ module @gemm attributes {gpu.container_module} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load tile - %tile = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<32x16xf32> -> !xegpu.tensor_desc<32x8xf32, #xegpu.tdesc_attr> - %value = xegpu.load_nd %tile : !xegpu.tensor_desc<32x8xf32, #xegpu.tdesc_attr> -> vector<2x32x8xf32> + %tile = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<32x16xf32> -> !xegpu.tensor_desc<32x8xf32, #xegpu.block_tdesc_attr> + %value = xegpu.load_nd %tile : !xegpu.tensor_desc<32x8xf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xf32> // extract the bottom 8x8 part of first 32x8 block %sub_tile0 = vector.extract_strided_slice %value { offsets = [0, 24], strides = [1, 1], sizes = [1, 8] } : vector<2x32x8xf32> to vector<1x8x8xf32> // extract the bottom 8x8 part of second 32x8 block diff --git a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir index 9d969b3d2..7758ce094 100644 --- a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir +++ b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir @@ -53,11 +53,11 @@ module { %26 = arith.muli %25, %c256 : index %27 = arith.divsi %15, %c32 : index %28 = arith.muli %27, %c32 : index - %29 = xegpu.create_nd_tdesc %arg0[%19, %15] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + %29 = xegpu.create_nd_tdesc %arg0[%19, %15] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %30 = arith.divsi %23, %c32 : index %31 = arith.muli %30, %c32 : index %32 = arith.addi %26, %2 : index - %33 = xegpu.create_nd_tdesc %arg0[%32, %28] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> + %33 = xegpu.create_nd_tdesc %arg0[%32, %28] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %34 = arith.remsi %11, %c4 : index %35 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) { %39 = vector.shape_cast %arg4 : vector<8x1xf32> to vector<8xf32> @@ -75,29 +75,29 @@ module { %50 = arith.addi %49, %24 : index %51 = arith.divsi %50, %c128 : index %52 = arith.muli %51, %c128 : index - %53 = 
xegpu.create_nd_tdesc %arg1[%50, %23] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %33 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %54 = xegpu.update_nd_offset %33, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %54 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %55 = xegpu.update_nd_offset %54, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %55 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %56 = xegpu.update_nd_offset %55, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> + %53 = xegpu.create_nd_tdesc %arg1[%50, %23] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %33 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %54 = xegpu.update_nd_offset %33, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %54 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %55 = xegpu.update_nd_offset %54, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %55 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %56 = xegpu.update_nd_offset %55, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %57 = arith.addi %52, %3 : index - %58 = xegpu.create_nd_tdesc %arg1[%57, %31] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %58 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - %59 = xegpu.update_nd_offset %58, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %59 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - %60 = xegpu.update_nd_offset %59, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %60 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - %61 = xegpu.update_nd_offset %60, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - %62:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %29, %arg7 = %53, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %56, %arg17 = %61, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>, index) { + %58 = xegpu.create_nd_tdesc %arg1[%57, %31] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, 
#xegpu.block_tdesc_attr> + xegpu.prefetch_nd %58 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %59 = xegpu.update_nd_offset %58, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %59 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %60 = xegpu.update_nd_offset %59, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %60 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %61 = xegpu.update_nd_offset %60, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %62:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %29, %arg7 = %53, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %56, %arg17 = %61, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { %391 = arith.cmpi eq, %arg18, %c21 : index %392 = arith.select %391, %c0, %arg18 : index scf.if %391 { gpu.barrier } %393 = arith.addi %392, %c1 : index - %394 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x32x16xbf16> + %394 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> %395 = vector.shape_cast %394 : vector<2x32x16xbf16> to vector<1024xbf16> %396 = vector.shuffle %395, %395 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 
301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> %397 = vector.shuffle %395, %395 [512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 
997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023] : vector<1024xbf16>, vector<1024xbf16> @@ -117,7 +117,7 @@ module { %411 = vector.shape_cast %410 : vector<128xbf16> to vector<8x16xbf16> %412 = vector.shuffle %397, %397 [384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<512xbf16>, vector<512xbf16> %413 = vector.shape_cast %412 : vector<128xbf16> to vector<8x16xbf16> - %414 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xbf16> + %414 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> %415 = vector.shape_cast %414 : vector<2x16x16x2xbf16> to vector<1024xbf16> %416 = vector.shuffle %415, %415 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 
412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> %417 = vector.shuffle %415, %415 [512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023] : vector<1024xbf16>, vector<1024xbf16> @@ -130,15 +130,15 @@ module { %424 = vector.shuffle %417, %417 [256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 
314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<512xbf16>, vector<512xbf16> %425 = vector.shape_cast %424 : vector<256xbf16> to vector<8x16x2xbf16> xegpu.compile_hint - xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> + xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint - %426 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %427 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> + %426 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %427 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint xegpu.compile_hint - %428 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> - %429 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + %428 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %429 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint %430 = xegpu.dpas %399, %419, %arg8 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %431 = xegpu.dpas %407, %421, %430 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> @@ -157,7 +157,7 @@ module { %444 = xegpu.dpas %405, %423, %arg15 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %445 = xegpu.dpas %413, %425, %444 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> xegpu.compile_hint - scf.yield %428, %429, %431, %433, %435, %437, %439, %441, %443, %445, %426, %427, %393 : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, 
vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>, index
+        scf.yield %428, %429, %431, %433, %435, %437, %439, %441, %443, %445, %426, %427, %393 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index
      }
      %63 = vector.shape_cast %62#2 : vector<8x16xf32> to vector<128xf32>
      %64 = vector.shape_cast %62#3 : vector<8x16xf32> to vector<128xf32>
@@ -419,13 +419,13 @@ module {
      %320 = arith.addf %318, %319 : vector<16xf32>
      %321 = vector.shuffle %317, %320 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32>
      %alloc = memref.alloc() : memref<256x4xf32, #spirv.storage_class>
-     %322 = xegpu.create_nd_tdesc %alloc[%13, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     %322 = xegpu.create_nd_tdesc %alloc[%13, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      %323 = arith.addi %13, %c8 : index
-     %324 = xegpu.create_nd_tdesc %alloc[%323, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     %324 = xegpu.create_nd_tdesc %alloc[%323, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      %325 = arith.addi %13, %c16 : index
-     %326 = xegpu.create_nd_tdesc %alloc[%325, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     %326 = xegpu.create_nd_tdesc %alloc[%325, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      %327 = arith.addi %13, %c24 : index
-     %328 = xegpu.create_nd_tdesc %alloc[%327, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     %328 = xegpu.create_nd_tdesc %alloc[%327, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      %329 = vector.shuffle %321, %321 [0, 1, 2, 3, 4, 5, 6, 7] : vector<32xf32>, vector<32xf32>
      %330 = vector.shape_cast %329 : vector<8xf32> to vector<8x1xf32>
      %331 = vector.shuffle %321, %321 [8, 9, 10, 11, 12, 13, 14, 15] : vector<32xf32>, vector<32xf32>
@@ -434,13 +434,13 @@ module {
      %334 = vector.shape_cast %333 : vector<8xf32> to vector<8x1xf32>
      %335 = vector.shuffle %321, %321 [24, 25, 26, 27, 28, 29, 30, 31] : vector<32xf32>, vector<32xf32>
      %336 = vector.shape_cast %335 : vector<8xf32> to vector<8x1xf32>
-     xegpu.store_nd %330, %322 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-     xegpu.store_nd %332, %324 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-     xegpu.store_nd %334, %326 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-     xegpu.store_nd %336, %328 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     xegpu.store_nd %330, %322 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+     xegpu.store_nd %332, %324 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+     xegpu.store_nd %334, %326 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+     xegpu.store_nd %336, %328 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      gpu.barrier
-     %337 = xegpu.create_nd_tdesc %alloc[%9, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr>
-     %338 = xegpu.load_nd %337 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr> -> vector<8x4xf32>
+     %337 = xegpu.create_nd_tdesc %alloc[%9, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr>
+     %338 = xegpu.load_nd %337 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32>
      %339 = vector.shape_cast %338 : vector<8x4xf32> to vector<32xf32>
      %340 = vector.shuffle %339, %339 [0, 1, 2, 3] : vector<32xf32>, vector<32xf32>
      %341 = vector.shuffle %339, %339 [4, 5, 6, 7] : vector<32xf32>, vector<32xf32>
@@ -499,8 +499,8 @@ module {
     }
     %36 = arith.addi %16, %18 : index
     %37 = arith.addi %36, %9 : index
-    %38 = xegpu.create_nd_tdesc %arg2[%37, %7] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-    xegpu.store_nd %35, %38 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %38 = xegpu.create_nd_tdesc %arg2[%37, %7] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+    xegpu.store_nd %35, %38 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     gpu.return
   }
 }
diff --git a/test/Transforms/VectorLinearize/postop_reduce_n.mlir b/test/Transforms/VectorLinearize/postop_reduce_n.mlir
index 4074eff3d..09f28d414 100644
--- a/test/Transforms/VectorLinearize/postop_reduce_n.mlir
+++ b/test/Transforms/VectorLinearize/postop_reduce_n.mlir
@@ -56,13 +56,13 @@ module {
     %28 = arith.muli %27, %c32 : index
     %29 = arith.addi %19, %c0 : index
     %30 = arith.addi %15, %c0 : index
-    %31 = xegpu.create_nd_tdesc %arg0[%29, %30] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>
+    %31 = xegpu.create_nd_tdesc %arg0[%29, %30] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>
     %32 = arith.divsi %23, %c32 : index
     %33 = arith.muli %32, %c32 : index
     %34 = arith.addi %26, %2 : index
     %35 = arith.addi %34, %c0 : index
     %36 = arith.addi %28, %c0 : index
-    %37 = xegpu.create_nd_tdesc %arg0[%35, %36] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
+    %37 = xegpu.create_nd_tdesc %arg0[%35, %36] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
     %38 = arith.remsi %11, %c4 : index
     %39 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) {
@@ -89,31 +89,31 @@ module {
      %57 = arith.muli %56, %c128 : index
      %58 = arith.addi %55, %c0 : index
      %59 = arith.addi %23, %c0 : index
-     %60 = xegpu.create_nd_tdesc %arg1[%58, %59] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %37 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     %61 = xegpu.update_nd_offset %37, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %61 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     %62 = xegpu.update_nd_offset %61, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %62 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     %63 = xegpu.update_nd_offset %62, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
+     %60 = xegpu.create_nd_tdesc %arg1[%58, %59] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>
+     xegpu.prefetch_nd %37 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     %61 = xegpu.update_nd_offset %37, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     xegpu.prefetch_nd %61 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     %62 = xegpu.update_nd_offset %61, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     xegpu.prefetch_nd %62 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     %63 = xegpu.update_nd_offset %62, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
      %64 = arith.addi %57, %3 : index
      %65 = arith.addi %64, %c0 : index
      %66 = arith.addi %33, %c0 : index
-     %67 = xegpu.create_nd_tdesc %arg1[%65, %66] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %67 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     %68 = xegpu.update_nd_offset %67, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %68 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     %69 = xegpu.update_nd_offset %68, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %69 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     %70 = xegpu.update_nd_offset %69, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     %71:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %31, %arg7 = %60, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %63, %arg17 = %70, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>,
vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>, index) { + %67 = xegpu.create_nd_tdesc %arg1[%65, %66] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %67 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %68 = xegpu.update_nd_offset %67, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %68 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %69 = xegpu.update_nd_offset %68, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %69 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %70 = xegpu.update_nd_offset %69, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %71:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %31, %arg7 = %60, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %63, %arg17 = %70, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { %437 = arith.cmpi eq, %arg18, %c21 : index %438 = arith.select %437, %c0, %arg18 : index scf.if %437 { gpu.barrier } %439 = arith.addi %438, %c1 : index - %440 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x32x16xbf16> + %440 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK: vector.shape_cast %{{.*}} : vector<2x32x16xbf16> to vector<1024xbf16> //CHECK: vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 
245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> @@ -144,7 +144,7 @@ module { %448 = vector.extract_strided_slice %442 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %449 = vector.extract_strided_slice %442 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %450 = vector.extract_strided_slice %442 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> - %451 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xbf16> + %451 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> //CHECK: vector.shape_cast %{{.*}} : vector<2x16x16x2xbf16> to vector<1024xbf16> //CHECK: vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 
262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> @@ -164,15 +164,15 @@ module { %456 = vector.extract_strided_slice %453 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> %457 = vector.extract_strided_slice %453 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> xegpu.compile_hint - xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> + xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint - %458 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %459 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> + %458 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %459 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint xegpu.compile_hint - %460 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> - %461 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + %460 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %461 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint %462 = xegpu.dpas %443, %454, %arg8 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %463 = xegpu.dpas %447, %455, %462 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> @@ -191,7 +191,7 @@ module { %476 = xegpu.dpas %446, %456, %arg15 : vector<8x16xbf16>, 
vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32>
      %477 = xegpu.dpas %450, %457, %476 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32>
      xegpu.compile_hint
-     scf.yield %460, %461, %463, %465, %467, %469, %471, %473, %475, %477, %458, %459, %439 : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>, index
+     scf.yield %460, %461, %463, %465, %467, %469, %471, %473, %475, %477, %458, %459, %439 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index
     }
 //CHECK-COUNT-8: vector.shape_cast %{{.*}} : vector<8x16xf32> to vector<128xf32>
@@ -489,27 +489,27 @@ module {
     %alloc = memref.alloc() : memref<256x4xf32, #spirv.storage_class>
     %359 = arith.addi %13, %c0 : index
     %360 = arith.addi %38, %c0 : index
-    %361 = xegpu.create_nd_tdesc %alloc[%359, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %361 = xegpu.create_nd_tdesc %alloc[%359, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     %362 = arith.addi %13, %c8 : index
-    %363 = xegpu.create_nd_tdesc %alloc[%362, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %363 = xegpu.create_nd_tdesc %alloc[%362, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     %c16 = arith.constant 16 : index
     %364 = arith.addi %13, %c16 : index
-    %365 = xegpu.create_nd_tdesc %alloc[%364, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %365 = xegpu.create_nd_tdesc %alloc[%364, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     %c24 = arith.constant 24 : index
     %366 = arith.addi %13, %c24 : index
-    %367 = xegpu.create_nd_tdesc %alloc[%366, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %367 = xegpu.create_nd_tdesc %alloc[%366, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     %368 = vector.extract_strided_slice %358 {offsets = [0, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32>
     %369 = vector.extract_strided_slice %358 {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32>
     %370 = vector.extract_strided_slice %358 {offsets = [16, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32>
     %371 = vector.extract_strided_slice %358 {offsets = [24, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32>
-    xegpu.store_nd %368, %361 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-    xegpu.store_nd %369, %363 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-    xegpu.store_nd %370, %365 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-    xegpu.store_nd %371, %367 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    xegpu.store_nd %368, %361 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+    xegpu.store_nd %369, %363 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+    xegpu.store_nd %370, %365 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+    xegpu.store_nd %371, %367 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     gpu.barrier
     %372 = arith.addi %9, %c0 : index
-    %373 = xegpu.create_nd_tdesc %alloc[%372, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr>
-    %374 = xegpu.load_nd %373 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr> -> vector<8x4xf32>
+    %373 = xegpu.create_nd_tdesc %alloc[%372, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr>
+    %374 = xegpu.load_nd %373 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32>
     %375 = vector.extract_strided_slice %374 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32>
     %376 = vector.extract_strided_slice %374 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32>
     %377 = vector.extract_strided_slice %374 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32>
@@ -591,8 +591,8 @@ module {
    %41 = arith.addi %40, %9 : index
    %42 = arith.addi %41, %c0 : index
    %43 = arith.addi %7, %c0 : index
-   %44 = xegpu.create_nd_tdesc %arg2[%42, %43] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-   xegpu.store_nd %39, %44 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+   %44 = xegpu.create_nd_tdesc %arg2[%42, %43] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+   xegpu.store_nd %39, %44 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
    gpu.return
  }
}
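
Note: the hunks above are a mechanical rename, so here is a minimal
illustrative sketch of the before/after surface syntax. It is not part of
the patch itself: the value names and the chunk_size/array_length values are
invented, the pre-patch spelling of chunk_size on create_tdesc is an
assumption based on the patch subject, and the `<...>` parameter payloads
are reconstructed from the attribute definitions earlier in this patch (the
hunks above lost the text inside some angle brackets, e.g. after
#xegpu.cache_hint and #spirv.storage_class).

  // Before: one generic #xegpu.tdesc_attr served both descriptor kinds,
  // with `scattered` telling them apart and chunk_size carried by the op.
  %t0 = xegpu.create_tdesc %src, %offsets {chunk_size = 2 : i64}
        : memref<1024xf32>, vector<16xindex>
        -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr<scattered = true>>

  // After: the attribute itself encodes the descriptor kind, and chunk_size
  // (default 1) is a parameter of #xegpu.scatter_tdesc_attr.
  %t1 = xegpu.create_tdesc %src, %offsets
        : memref<1024xf32>, vector<16xindex>
        -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>

  // Block descriptors keep memory_scope/array_length/boundary_check under
  // #xegpu.block_tdesc_attr; with every parameter at its default the
  // attribute prints bare, as it does throughout the tests above. A
  // non-default parameter would print explicitly, for example:
  %t2 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<32x16xf32>
        -> !xegpu.tensor_desc<32x8xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>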