Move chunk_size into TensorDesc attribute

intel · Aug 2, 2024 · f48f957 · f48f957
1 parent 4e86e50
commit f48f957
Show file tree

Hide file tree

Showing 51 changed files with 963 additions and 573 deletions.
diff --git a/build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch b/build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch
diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -1407,7 +1407,7 @@ struct XeGPUToVCPass : public ::imex::ConvertXeGPUToVCBase<XeGPUToVCPass> {
 
     typeConverter.addConversion(
         [&](xegpu::TensorDescType type) -> ::mlir::Type {
-          if (type.getScattered()) {
+          if (type.isScattered()) {
             return ::mlir::VectorType::get(
                 16, ::mlir::IndexType::get(&getContext()));
           }

diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
@@ -457,8 +457,8 @@ class SgInitTileOpPattern : public XeOneToNConversion<xetile::InitTileOp> {
       std::swap(offsetsX, offsetsY);
 
     auto tDescTy = mlir::xegpu::TensorDescType::get(
-        innerBlk, elemTy, false /*scattered*/, array_length,
-        mlir::xegpu::MemoryScope::Global, true /*boundary_check*/);
+        innerBlk, elemTy, array_length, true /*boundary_check*/,
+        mlir::xegpu::MemoryScope::Global);
 
     auto createIndexConstant = [&](mlir::Type type, int64_t value) {
       auto attr = rewriter.getIndexAttr(value);

diff --git a/test/Conversion/XeGPUToVC/atomiclsc.mlir b/test/Conversion/XeGPUToVC/atomiclsc.mlir
@@ -22,14 +22,14 @@ module @gemm attributes {gpu.container_module} {
       // CHECK: %[[ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<4> : vector<16xindex>
       // CHECK: %[[OFFSETS_ADJUSTED:.*]] = arith.muli %[[ELEMENT_BYTEWIDTH]], %[[OFFSETS]] : vector<16xindex>
       // CHECK: %[[VEC_OFFSETS_APPLIED:.*]] = arith.addi %[[VEC_BASEPTR_SHUFFLED]], %[[OFFSETS_ADJUSTED]] : vector<16xindex>
-      %2 = xegpu.create_tdesc %arg0, %offsets {chunk_size = 1} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
+      %2 = xegpu.create_tdesc %arg0, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 
       // CHECK: %[[cst_3:.*]] = arith.constant dense<true> : vector<16xi1>
       // CHECK: %[[cst_8:.*]] = arith.constant dense<0> : vector<16xi32>
       // CHECK: %[[SRC0:.*]] = vector.bitcast %[[cst_0]] : vector<16xf32> to vector<16xi32>
       // CHECK: %[[ATOMIC_RES:.*]] = func.call @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64({{.*}}, %[[VEC_OFFSETS_APPLIED]], %[[SRC0]], %[[cst_8]], {{.*}}, %[[cst_8]]) : ({{.*}}) -> vector<16xi32>
       // CHECK: %{{.*}} = vector.bitcast %[[ATOMIC_RES]] : vector<16xi32> to vector<16xf32>
-      %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+      %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
       gpu.return
     }
  }

diff --git a/test/Conversion/XeGPUToVC/loadgather.mlir b/test/Conversion/XeGPUToVC/loadgather.mlir
@@ -19,7 +19,7 @@ module @gemm attributes {gpu.container_module} {
       // CHECK: %[[IN_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
       // CHECK: %[[IN_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[IN_ELEMENT_BYTEWIDTH]], %[[OFFSET]] : vector<16xindex>
       // CHECK: %[[IN_PAYLOAD:.*]] = arith.addi %[[IN_PAYLOAD_BASEPTR_SHUFFLED]], %[[IN_ELEMENTWISE_OFFSET]] : vector<16xindex>
-      %tdesc_in = xegpu.create_tdesc %in, %offsets {chunk_size = 2} : memref<?xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>
+      %tdesc_in = xegpu.create_tdesc %in, %offsets : memref<?xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
 
       // CHECK: %[[OUT_EMPTY_PAYLOAD:.*]] = arith.constant dense<0> : vector<16xindex>
       // CHECK: %[[OUT_BASEPTR:.*]] = memref.extract_aligned_pointer_as_index {{.*}} : memref<32xf16> -> index
@@ -28,16 +28,16 @@ module @gemm attributes {gpu.container_module} {
       // CHECK: %[[OUT_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
       // CHECK: %[[OUT_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[OUT_ELEMENT_BYTEWIDTH]], %[[OFFSET]] : vector<16xindex>
       // CHECK: %[[OUT_PAYLOAD:.*]] = arith.addi %[[OUT_PAYLOAD_BASEPTR_SHUFFLED]], %[[OUT_ELEMENTWISE_OFFSET]] : vector<16xindex>
-      %tdesc_out = xegpu.create_tdesc %out_flat, %offsets {chunk_size = 2} : memref<32xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>
+      %tdesc_out = xegpu.create_tdesc %out_flat, %offsets : memref<32xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
 
       // CHECK: %[[OLD:.*]] =  arith.constant dense<0> : vector<16xi32>
       // CHECK: %[[LOAD_RES:.*]] = func.call @llvm.genx.raw.send2.v16i32.v16i1.v16i64({{.*}}, %[[MASK]], {{.*}}, %[[IN_PAYLOAD]], %[[OLD]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<16xi32>) -> vector<16xi32>
-      %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1> -> vector<16x2xf16>
+      %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<16xi1> -> vector<16x2xf16>
       // CHECK: %[[POST_OP_ELEMENT_TYPE_CAST:.*]] = vector.bitcast %[[LOAD_RES]] : vector<16xi32> to vector<32xf16>
 
       // CHECK: %[[PRE_OP_ELEMENT_TYPE_CAST:.*]] = vector.bitcast %[[POST_OP_ELEMENT_TYPE_CAST]] : vector<32xf16> to vector<16xi32>
       // CHECK: func.call @llvm.genx.raw.sends2.noresult.v16i1.v16i64.v16i32({{.*}}, %[[MASK]], {{.*}}, %[[OUT_PAYLOAD]], %[[PRE_OP_ELEMENT_TYPE_CAST]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<16xi32>) -> ()
-      xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf16>, !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+      xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf16>, !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<16xi1>
 
       gpu.return
     }

diff --git a/test/Conversion/XeGPUToVC/loadgather_dpas.mlir b/test/Conversion/XeGPUToVC/loadgather_dpas.mlir
@@ -19,11 +19,11 @@ module @gemm attributes {gpu.container_module} {
          // CHECK: %[[IN_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
          // CHECK: %[[IN_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[IN_ELEMENT_BYTEWIDTH]], %[[IN_OFFSET]] : vector<16xindex>
          // CHECK: %[[IN_PAYLOAD:.*]] = arith.addi %[[IN_PAYLOAD_BASEPTR_SHUFFLED]], %[[IN_ELEMENTWISE_OFFSET]] : vector<16xindex>
-         %0 = xegpu.create_tdesc %arg0, %offsets {chunk_size = 8} : memref<128xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<scattered = true>>
+         %0 = xegpu.create_tdesc %arg0, %offsets : memref<128xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
 
          // CHECK: %[[OLD:.*]] =  arith.constant dense<0> : vector<64xi32>
          // CHECK: %[[LOAD_RES:.*]] = func.call @llvm.genx.raw.send2.v64i32.v16i1.v16i64({{.*}}, %[[MASK]], {{.*}}, %[[IN_PAYLOAD]], %[[OLD]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<64xi32>) -> vector<64xi32>
-         %3 = xegpu.load %0, %mask : !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1> -> vector<16x8xf16>
+         %3 = xegpu.load %0, %mask : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1> -> vector<16x8xf16>
 
          // CHECK: %[[LOADA_v128f16:.*]] = vector.bitcast %[[LOAD_RES]] : vector<64xi32> to vector<128xf16>
          %66 = vector.shape_cast %3: vector<16x8xf16> to vector<128xf16>
@@ -58,12 +58,12 @@ module @gemm attributes {gpu.container_module} {
          // CHECK: %[[OUT_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<4> : vector<16xindex>
          // CHECK: %[[OUT_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[OUT_ELEMENT_BYTEWIDTH]], %[[OUT_OFFSET]] : vector<16xindex>
          // CHECK: %[[OUT_PAYLOAD:.*]] = arith.addi %[[OUT_PAYLOAD_BASEPTR_SHUFFLED]], %[[OUT_ELEMENTWISE_OFFSET]] : vector<16xindex>
-         %2 = xegpu.create_tdesc %arg2, %offsets2 {chunk_size = 8} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr<scattered = true>>
+         %2 = xegpu.create_tdesc %arg2, %offsets2 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
          %7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32>
          %8 = vector.shape_cast %7: vector<128xf32> to vector<16x8xf32>
 
          // CHECK: func.call @llvm.genx.raw.sends2.noresult.v16i1.v16i64.v128f32({{.*}}, %[[MASK]], {{.*}}, %[[OUT_PAYLOAD]], %[[C_ACC_v128f32]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<128xf32>) -> ()
-         xegpu.store %8, %2, %mask : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+         xegpu.store %8, %2, %mask : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>
 
          gpu.return
       }

diff --git a/test/Conversion/XeTileToXeGPU/reduction.mlir b/test/Conversion/XeTileToXeGPU/reduction.mlir
@@ -10,10 +10,10 @@ module {
       %c0 = arith.constant 0 : index
       %acc = arith.constant dense<0.0> : vector<16xf16>
       //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16>
-      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
       %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16>
       //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}>
-      //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true, scattered = false>> -> vector<16x32xf16>
+      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>> -> vector<16x32xf16>
       %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
 
       //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16>
@@ -179,9 +179,9 @@ module {
       %r = vector.multi_reduction <add>, %e, %acc [1] : vector<16x32xf16> to vector<16xf16>
       //CHECK: %[[R161:.*]] = vector.shape_cast %[[R160]] : vector<16xf16> to vector<2x8xf16>
       %c = vector.shape_cast %r: vector<16xf16> to vector<2x8xf16>
-      //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+      //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
       %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<2x8xf16>
-      //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+      //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
       xetile.store_tile %c, %s : vector<2x8xf16>, !xetile.tile<2x8xf16>
       gpu.return
     }
@@ -193,10 +193,10 @@ module {
       %c0 = arith.constant 0 : index
       %acc = arith.constant dense<0.0> : vector<32xf16>
       //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16>
-      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
       %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16>
       //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}>
-      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true, scattered = false>> -> vector<16x32xf16>
+      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>> -> vector<16x32xf16>
       %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
       //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16>
       //CHECK: %[[R3:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [1, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16>
@@ -318,9 +318,9 @@ module {
       %r = vector.multi_reduction <add>, %e, %acc [0] : vector<16x32xf16> to vector<32xf16>
       //CHECK: %[[R118:.*]] = vector.shape_cast %[[R117]] : vector<32xf16> to vector<4x8xf16>
       %c = vector.shape_cast %r: vector<32xf16> to vector<4x8xf16>
-      //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+      //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
       %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<4x8xf16>
-      //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true, scattered = false>>
+      //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
       xetile.store_tile %c, %s : vector<4x8xf16>, !xetile.tile<4x8xf16>
       gpu.return
     }