From 1d4d93d27a92a1d4e142afc939973701f079686f Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Fri, 2 Aug 2024 19:05:22 +0000 Subject: [PATCH] Move chunk_size into TensorDesc attribute --- ...0013-Move-chunk_size-into-TensorDesc.patch | 395 ++++++++++++++++++ lib/Conversion/XeGPUToVC/XeGPUToVC.cpp | 2 +- .../XeTileToXeGPU/XeTileOpConversion.cpp | 3 +- test/Conversion/XeGPUToVC/atomiclsc.mlir | 4 +- test/Conversion/XeGPUToVC/loadgather.mlir | 8 +- .../Conversion/XeGPUToVC/loadgather_dpas.mlir | 8 +- test/Conversion/XeTileToXeGPU/reduction.mlir | 16 +- .../sg_gemm_1k_1k_1k_f16_f32.mlir | 222 +++++----- .../sg_gemm_1k_1k_1k_i8_i32.mlir | 38 +- .../sg_gemm_1k_1k_1k_tf32_tf32.mlir | 80 ++-- .../XeTileToXeGPU/sg_load_tile.mlir | 4 +- .../XeTileToXeGPU/sg_mixed_scf.mlir | 26 +- test/Conversion/XeTileToXeGPU/sg_scf_for.mlir | 26 +- test/Conversion/XeTileToXeGPU/sg_softmax.mlir | 16 +- .../XeTileToXeGPU/sg_store_tile.mlir | 32 +- .../Conversion/XeTileToXeGPU/sg_tile_mma.mlir | 12 +- .../XeTileToXeGPU/sg_tiled_broadcast.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_load_tile.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_scf_for.mlir | 26 +- .../XeTileToXeGPU/sg_tiled_softmax.mlir | 16 +- .../XeTileToXeGPU/sg_tiled_store_tile.mlir | 32 +- .../XeTileToXeGPU/sg_tiled_tile_mma.mlir | 12 +- test/Conversion/XeTileToXeGPU/test_order.mlir | 16 +- test/Dialect/XeGPU/IR/XeGPUOps.mlir | 9 +- test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir | 18 +- test/Dialect/XeGPU/IR/create_nd_tdesc.mlir | 16 +- test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir | 16 +- test/Dialect/XeGPU/IR/create_tdesc_vc.mlir | 32 +- test/Dialect/XeGPU/IR/invalid_vc.mlir | 13 +- test/Dialect/XeGPU/IR/load_gather_vc.mlir | 30 +- test/Dialect/XeGPU/IR/load_nd_vc.mlir | 8 +- test/Dialect/XeGPU/IR/store_scatter.mlir | 16 +- test/Dialect/XeGPU/IR/store_scatter_vc.mlir | 17 +- test/Dialect/XeGPU/IR/update_offset_vc.mlir | 12 +- .../XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir | 30 +- ...kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir | 30 +- ...4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir | 30 +- .../load_with_block_array_16_16_2.vc.mlir | 4 +- .../load_with_block_array_32_16_2.vc.mlir | 4 +- .../load_with_block_array_8_16_2.vc.mlir | 4 +- .../XeGPU/loadgather2d_masked_f32.mlir | 20 +- .../XeGPU/loadgather2d_masked_slm_f32.mlir | 20 +- .../XeGPU/loadgather_chunk_size_f32.mlir | 8 +- .../XeGPU/loadgather_chunk_size_i32.mlir | 8 +- .../Dialect/XeGPU/loadgather_f32.mlir | 8 +- .../Dialect/XeGPU/loadgather_masked_f32.mlir | 8 +- .../XeGPU/loadgather_masked_slm_f32.mlir | 8 +- .../Dialect/XeGPU/loadgather_slm_f32.mlir | 8 +- .../vector_extract_strided_slice_2.vc.mlir | 4 +- .../postop_reduce_n.mlir | 76 ++-- .../VectorLinearize/postop_reduce_n.mlir | 76 ++-- 51 files changed, 962 insertions(+), 573 deletions(-) create mode 100644 build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch diff --git a/build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch b/build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch new file mode 100644 index 000000000..9b190e0e3 --- /dev/null +++ b/build_tools/patches/0013-Move-chunk_size-into-TensorDesc.patch @@ -0,0 +1,395 @@ +From 94685ba4f22afa8922feebe292e8b525b8d012b7 Mon Sep 17 00:00:00 2001 +From: Chao Chen +Date: Mon, 29 Jul 2024 18:40:29 +0000 +Subject: [PATCH] Move chunk_size into TensorDesc + +--- + .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 43 ++++++++++--- + .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 12 ++-- + .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 63 ++++++++++++------- + mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 41 
++++++++---- + mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 21 ++++--- + 5 files changed, 121 insertions(+), 59 deletions(-) + +diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +index f3ca09a6a68e..1dfe55a4bba0 100644 +--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td ++++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +@@ -19,9 +19,15 @@ class XeGPUAttr traits = [], + let mnemonic = attrMnemonic; + } + +-def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { ++class XeGPU_TensorDescAttr traits = [], ++ string baseCppClass = "::mlir::Attribute"> ++ : XeGPUAttr { ++ let assemblyFormat = "`<` struct(params) `>`"; ++} ++ ++def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> { + let summary = [{a composite attribute for `TensorDescType`}]; +- let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite ++ let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite + attribute defined for `TensorDescType` for describing following + properties of a `TensorDesc`. + 1. `memory_scope`: It describes where the data block described by the +@@ -33,27 +39,46 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { + 8x32. Its default value is 1. + 3. `boundary_check`: It is used to indicates the hardware whether to do + out-of-boundary check. The default value is true. +- 4. `scattered`: It is used to differenciate TensorDescs created from +- `create_nd_tdesc` vs from `create_tdesc`. + }]; + + let parameters = (ins + OptionalParameter<"MemoryScopeAttr">: $memory_scope, + OptionalParameter<"IntegerAttr", "1">: $array_length, +- OptionalParameter<"BoolAttr", "true">: $boundary_check, +- OptionalParameter<"BoolAttr", "false">: $scattered ++ OptionalParameter<"BoolAttr", "true">: $boundary_check + ); + + let builders = [ + AttrBuilder<(ins + CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, + CArg<"int", "1">:$array_length, +- CArg<"bool", "true">: $boundary_check, +- CArg<"bool", "false">: $scattered ++ CArg<"bool", "true">: $boundary_check + )> + ]; ++} + +- let assemblyFormat = "`<` struct(params) `>`"; ++def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> { ++ let summary = [{a composite attribute for `TensorDescType`}]; ++ let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite ++ attribute defined for `TensorDescType` for describing following ++ properties of a `TensorDesc`. ++ 1. `memory_scope`: It describes where the data block described by the ++ TensorDesc is located, `Global` device memory or `Shared` local memory. ++ It is default to `Global`. ++ 2. `chunk_size`: indicates number of continious elements accessed for each ++ offset, default is 1. It is used with `scattered` attr only. 
++ }]; ++ ++ let parameters = (ins ++ OptionalParameter<"MemoryScopeAttr">: $memory_scope, ++ OptionalParameter<"IntegerAttr", "1">: $chunk_size ++ ); ++ ++ let builders = [ ++ AttrBuilder<(ins ++ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, ++ CArg<"int", "1">: $chunk_size ++ )> ++ ]; + } + + //===----------------------------------------------------------------------===// +diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +index 7111126f9c28..d3b38836b70b 100644 +--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td ++++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +@@ -403,33 +403,31 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { + is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, + implying each element in the array corresponds to a work-item (SIMT lane) + in the subgroup. +- * chunk_size: [optional attribute] indicates number of continious +- elements accessed for each offset, default is 1. ++ + + Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] + ``` + %a = memref.alloc() : memref<1024xf32> +- %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32> ++ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32, chunk_size_per_lane = 1> + ``` + + Example 2. It assumes subgroup size is 4, and each workitem access 8 elements. + It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] + ``` + %0 = memref.alloc() : memref<1024xf32> +- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> ++ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8> + ``` + + Example 3. It is similar to Example 2, but there is some overlaps among workitems. 
+ It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
+ ```
+ %0 = memref.alloc() : memref<1024xf32>
+- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
++ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>
+ ```
+ }];
+
+ let arguments = (ins XeGPU_BaseAddrType: $source,
+- XeGPU_OffsetType: $offsets,
+- DefaultValuedAttr<I64Attr, "1">: $chunk_size);
++ XeGPU_OffsetType: $offsets);
+ let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+ let assemblyFormat = [{
+diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+index 111a270a28b2..0c4dc11256d5 100644
+--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
++++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+@@ -88,11 +88,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+ TypeBuilderWithInferredContext<(ins
+ "llvm::ArrayRef<int64_t>": $shape,
+ "mlir::Type": $elementType,
+- CArg<"bool", "false">: $scattered,
+ CArg<"int", "1">: $array_length,
+- CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+- CArg<"bool", "true">: $boundary_check
+- )>
++ CArg<"bool", "true">: $boundary_check,
++ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope)>,
++ TypeBuilderWithInferredContext<(ins
++ "llvm::ArrayRef<int64_t>": $shape,
++ "mlir::Type": $elementType,
++ CArg<"int", "1">: $chunk_size,
++ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope)>
+ ];
+
+ let extraClassDeclaration = [{
+@@ -110,40 +113,58 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+ return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+ }
+
+- TensorDescAttr getEncodingAsTensorDescAttr() const {
+- return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
++ BlockTensorDescAttr getEncodingAsBlockTensorDescAttr() const {
++ return llvm::dyn_cast_if_present<BlockTensorDescAttr>(getEncoding());
++ }
++
++ ScatterTensorDescAttr getEncodingAsScatterTensorDescAttr() const {
++ return llvm::dyn_cast_if_present<ScatterTensorDescAttr>(getEncoding());
+ }
+
+ xegpu::MemoryScope getMemoryScope() const {
+- auto attr = getEncodingAsTensorDescAttr();
+- if (attr && attr.getMemoryScope())
+- return attr.getMemoryScope().getValue();
++ auto block_attr = getEncodingAsBlockTensorDescAttr();
++ if (block_attr && block_attr.getMemoryScope())
++ return block_attr.getMemoryScope().getValue();
++
++ auto scatter_attr = getEncodingAsScatterTensorDescAttr();
++ if (scatter_attr && scatter_attr.getMemoryScope())
++ return scatter_attr.getMemoryScope().getValue();
++
+ // return default value
+ return MemoryScope::Global;
+ }
+
+ int getArrayLength() {
+- auto attr = getEncodingAsTensorDescAttr();
+- if (attr && attr.getArrayLength())
+- return attr.getArrayLength().getInt();
++ auto attr = getEncoding();
++ auto block_attr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(attr);
++ assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr.");
++ if (block_attr && block_attr.getArrayLength())
++ return block_attr.getArrayLength().getInt();
+ // return default value
+ return 1;
+ }
+
+ bool getBoundaryCheck() {
+- auto attr = getEncodingAsTensorDescAttr();
+- if (attr && attr.getBoundaryCheck())
+- return attr.getBoundaryCheck().getValue();
++ auto attr = getEncoding();
++ auto block_attr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(attr);
++ assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr.");
++ if (block_attr && block_attr.getBoundaryCheck())
++ return block_attr.getBoundaryCheck().getValue();
+ // return default value
+ return 
true;
+ }
+
+- bool getScattered() {
+- auto attr = getEncodingAsTensorDescAttr();
+- if (attr && attr.getScattered())
+- return attr.getScattered().getValue();
+- // return default value
+- return false;
++ bool isScattered() {
++ return bool(getEncodingAsScatterTensorDescAttr());
++ }
++
++ int getChunkSize() {
++ auto attr = getEncoding();
++ auto scatter_attr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(attr);
++ assert((!attr || scatter_attr) && "invalid on non ScatterTensorDescAttr.");
++ if (scatter_attr && scatter_attr.getChunkSize())
++ return scatter_attr.getChunkSize().getInt();
++ return 1;
+ }
+ }];
+
+diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+index 24719fe748fe..a5632c3fab8c 100644
+--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
++++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+@@ -30,20 +30,31 @@ void XeGPUDialect::initialize() {
+ }
+
+ //===----------------------------------------------------------------------===//
+-// XeGPU_TensorDescAttr
++// XeGPU_BlockTensorDescAttr
+ //===----------------------------------------------------------------------===//
+-TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context,
+- xegpu::MemoryScope memory_scope,
+- int array_length, bool boundary_check,
+- bool scattered) {
++BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context,
++ xegpu::MemoryScope memory_scope,
++ int array_length, bool boundary_check) {
+ auto scopeAttr = MemoryScopeAttr::get(context, memory_scope);
+ auto lengthAttr =
+ IntegerAttr::get(IntegerType::get(context, 64), array_length);
+ auto boundaryAttr = BoolAttr::get(context, boundary_check);
+- auto scatteredAttr = BoolAttr::get(context, scattered);
+- return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr);
++ return Base::get(context, scopeAttr, lengthAttr, boundaryAttr);
+ }
+
++//===----------------------------------------------------------------------===//
++// XeGPU_ScatterTensorDescAttr
++//===----------------------------------------------------------------------===//
++ScatterTensorDescAttr ScatterTensorDescAttr::get(mlir::MLIRContext *context,
++ xegpu::MemoryScope memory_scope,
++ int chunk_size) {
++ auto scopeAttr = MemoryScopeAttr::get(context, memory_scope);
++ auto chunkSizeAttr =
++ IntegerAttr::get(IntegerType::get(context, 64), chunk_size);
++ return Base::get(context, scopeAttr, chunkSizeAttr);
++}
++
++
+ //===----------------------------------------------------------------------===//
+ // XeGPU_TensorDescType
+ //===----------------------------------------------------------------------===//
+@@ -108,12 +119,18 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+ }
+
+ TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
+- mlir::Type elementType, bool scattered,
+- int array_length, MemoryScope memory_scope,
+- bool boundary_check) {
++ mlir::Type elementType, int array_length,
++ bool boundary_check, MemoryScope memory_scope) {
++ auto context = elementType.getContext();
++ auto attr = BlockTensorDescAttr::get(context, memory_scope, array_length, boundary_check);
++ return Base::get(context, shape, elementType, attr);
++}
++
++TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
++ mlir::Type elementType, int chunk_size,
++ MemoryScope memory_scope) {
+ auto context = elementType.getContext();
+- auto attr = TensorDescAttr::get(context, memory_scope, array_length,
+- boundary_check, scattered);
++ auto attr = ScatterTensorDescAttr::get(context, memory_scope, chunk_size);
+ return 
Base::get(context, shape, elementType, attr);
+ }
+
+diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+index 2bdc87f36fa3..7591316d9fe1 100644
+--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
++++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+@@ -149,7 +149,7 @@ LogicalResult CreateNdDescOp::verify() {
+ return emitOpError("TensorDesc should have the same element "
+ "type with the source if it is a memref.\n");
+
+- if (getType().getScattered())
++ if (getType().isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ return success();
+@@ -160,7 +160,7 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult PrefetchNdOp::verify() {
+ auto tdescTy = getTensorDescType();
+- if (tdescTy.getScattered())
++ if (tdescTy.isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+@@ -185,7 +185,7 @@ LogicalResult LoadNdOp::verify() {
+ if (tdescTy.getRank() > 2)
+ return emitOpError("Expecting a 1D/2D TensorDesc.\n");
+
+- if (tdescTy.getScattered())
++ if (tdescTy.isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ if (!valueTy)
+@@ -253,7 +253,7 @@ LogicalResult StoreNdOp::verify() {
+ if (dstTy.getRank() > 2)
+ return emitOpError("Expecting a 1D/2D TensorDesc.\n");
+
+- if (dstTy.getScattered())
++ if (dstTy.isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ if (!valTy)
+@@ -276,7 +276,7 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult UpdateNdOffsetOp::verify() {
+ auto ty = getTensorDescType();
+- if (ty.getScattered())
++ if (ty.isScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ // number of offsets specified must match the rank of the tensor descriptor
+@@ -291,15 +291,16 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult CreateDescOp::verify() {
+ auto tdescTy = getTensorDescType();
+- auto chunkSize = getChunkSize();
+
+ if (getRankOf(getSource()) > 1)
+ return emitOpError(
+ "Expecting the source is a 1D memref or pointer (uint64_t).");
+
+- if (!tdescTy.getScattered())
++ if (!tdescTy.isScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
++ auto chunkSize = tdescTy.getChunkSize();
++
+ SmallVector<int64_t> shape({(int64_t)getNumOffsets()});
+ if (chunkSize != 1)
+ shape.push_back(chunkSize);
+@@ -317,7 +318,7 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult PrefetchOp::verify() {
+ auto tdescTy = getTensorDescType();
+- if (!tdescTy.getScattered())
++ if (!tdescTy.isScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+@@ -340,7 +341,7 @@ LogicalResult LoadGatherOp::verify() {
+ auto maskTy = getMaskType();
+ auto valueTy = getValueType();
+
+- if (!tdescTy.getScattered())
++ if (!tdescTy.isScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+@@ -386,7 +387,7 @@
+ //===----------------------------------------------------------------------===//
+ LogicalResult StoreScatterOp::verify() {
+ auto tdescTy = getTensorDescType();
+- if 
(!tdescTy.getScattered())
++ if (!tdescTy.isScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isWriteHintOrNone(getL1HintAttr()))
+--
+2.34.1
diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
index bfee41a47..28bc8f4be 100644
--- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
+++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -1407,7 +1407,7 @@ struct XeGPUToVCPass : public ::imex::ConvertXeGPUToVCBase<XeGPUToVCPass> {
 typeConverter.addConversion(
 [&](xegpu::TensorDescType type) -> ::mlir::Type {
- if (type.getScattered()) {
+ if (type.isScattered()) {
 return ::mlir::VectorType::get(
 16, ::mlir::IndexType::get(&getContext()));
 }
diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
index 8fc2fdf10..e23ffd143 100644
--- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
+++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp
@@ -461,8 +461,7 @@ class SgInitTileOpPattern : public XeOneToNConversion<xetile::InitTileOp> {
 std::swap(offsetsX, offsetsY);

 auto tDescTy = mlir::xegpu::TensorDescType::get(
- innerBlk, elemTy, false /*scattered*/, array_length, memoryScope,
- true /*boundary_check*/);
+ innerBlk, elemTy, array_length, true /*boundary_check*/, memoryScope);

 auto createIndexConstant = [&](mlir::Type type, int64_t value) {
 auto attr = rewriter.getIndexAttr(value);
diff --git a/test/Conversion/XeGPUToVC/atomiclsc.mlir b/test/Conversion/XeGPUToVC/atomiclsc.mlir
index 58951cf1e..af2d8c0f4 100644
--- a/test/Conversion/XeGPUToVC/atomiclsc.mlir
+++ b/test/Conversion/XeGPUToVC/atomiclsc.mlir
@@ -22,14 +22,14 @@ module @gemm attributes {gpu.container_module} {
 // CHECK: %[[ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<4> : vector<16xindex>
 // CHECK: %[[OFFSETS_ADJUSTED:.*]] = arith.muli %[[ELEMENT_BYTEWIDTH]], %[[OFFSETS]] : vector<16xindex>
 // CHECK: %[[VEC_OFFSETS_APPLIED:.*]] = arith.addi %[[VEC_BASEPTR_SHUFFLED]], %[[OFFSETS_ADJUSTED]] : vector<16xindex>
- %2 = xegpu.create_tdesc %arg0, %offsets {chunk_size = 1} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
+ %2 = xegpu.create_tdesc %arg0, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>

 // CHECK: %[[cst_3:.*]] = arith.constant dense<true> : vector<16xi1>
 // CHECK: %[[cst_8:.*]] = arith.constant dense<0> : vector<16xi32>
 // CHECK: %[[SRC0:.*]] = vector.bitcast %[[cst_0]] : vector<16xf32> to vector<16xi32>
 // CHECK: %[[ATOMIC_RES:.*]] = func.call @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64({{.*}}, %[[VEC_OFFSETS_APPLIED]], %[[SRC0]], %[[cst_8]], {{.*}}, %[[cst_8]]) : ({{.*}}) -> vector<16xi32>
 // CHECK: %{{.*}} = vector.bitcast %[[ATOMIC_RES]] : vector<16xi32> to vector<16xf32>
- %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+ %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
 gpu.return
 }
}
diff --git a/test/Conversion/XeGPUToVC/loadgather.mlir b/test/Conversion/XeGPUToVC/loadgather.mlir
index bcad0a2e1..7109d51dc 100644
--- a/test/Conversion/XeGPUToVC/loadgather.mlir
+++ b/test/Conversion/XeGPUToVC/loadgather.mlir
@@ -19,7 +19,7 @@ module @gemm attributes {gpu.container_module} {
 // CHECK: %[[IN_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
 // CHECK: %[[IN_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[IN_ELEMENT_BYTEWIDTH]], %[[OFFSET]] : 
vector<16xindex>
 // CHECK: %[[IN_PAYLOAD:.*]] = arith.addi %[[IN_PAYLOAD_BASEPTR_SHUFFLED]], %[[IN_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %tdesc_in = xegpu.create_tdesc %in, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>
+ %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>

 // CHECK: %[[OUT_EMPTY_PAYLOAD:.*]] = arith.constant dense<0> : vector<16xindex>
 // CHECK: %[[OUT_BASEPTR:.*]] = memref.extract_aligned_pointer_as_index {{.*}} : memref<32xf16> -> index
@@ -28,16 +28,16 @@
 // CHECK: %[[OUT_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
 // CHECK: %[[OUT_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[OUT_ELEMENT_BYTEWIDTH]], %[[OFFSET]] : vector<16xindex>
 // CHECK: %[[OUT_PAYLOAD:.*]] = arith.addi %[[OUT_PAYLOAD_BASEPTR_SHUFFLED]], %[[OUT_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %tdesc_out = xegpu.create_tdesc %out_flat, %offsets {chunk_size = 2} : memref<32xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>
+ %tdesc_out = xegpu.create_tdesc %out_flat, %offsets : memref<32xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>

 // CHECK: %[[OLD:.*]] = arith.constant dense<0> : vector<16xi32>
 // CHECK: %[[LOAD_RES:.*]] = func.call @llvm.genx.raw.send2.v16i32.v16i1.v16i64({{.*}}, %[[MASK]], {{.*}}, %[[IN_PAYLOAD]], %[[OLD]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<16xi32>) -> vector<16xi32>
- %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1> -> vector<16x2xf16>
+ %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf16>

 // CHECK: %[[POST_OP_ELEMENT_TYPE_CAST:.*]] = vector.bitcast %[[LOAD_RES]] : vector<16xi32> to vector<32xf16>
 // CHECK: %[[PRE_OP_ELEMENT_TYPE_CAST:.*]] = vector.bitcast %[[POST_OP_ELEMENT_TYPE_CAST]] : vector<32xf16> to vector<16xi32>
 // CHECK: func.call @llvm.genx.raw.sends2.noresult.v16i1.v16i64.v16i32({{.*}}, %[[MASK]], {{.*}}, %[[OUT_PAYLOAD]], %[[PRE_OP_ELEMENT_TYPE_CAST]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<16xi32>) -> ()
- xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf16>, !xegpu.tensor_desc<16x2xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+ xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf16>, !xegpu.tensor_desc<16x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>

 gpu.return
 }
diff --git a/test/Conversion/XeGPUToVC/loadgather_dpas.mlir b/test/Conversion/XeGPUToVC/loadgather_dpas.mlir
index 600888f0e..1e45fa2d4 100644
--- a/test/Conversion/XeGPUToVC/loadgather_dpas.mlir
+++ b/test/Conversion/XeGPUToVC/loadgather_dpas.mlir
@@ -19,11 +19,11 @@ module @gemm attributes {gpu.container_module} {
 // CHECK: %[[IN_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<2> : vector<16xindex>
 // CHECK: %[[IN_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[IN_ELEMENT_BYTEWIDTH]], %[[IN_OFFSET]] : vector<16xindex>
 // CHECK: %[[IN_PAYLOAD:.*]] = arith.addi %[[IN_PAYLOAD_BASEPTR_SHUFFLED]], %[[IN_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %0 = xegpu.create_tdesc %arg0, %offsets {chunk_size = 8} : memref<128xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<scattered = true>>
+ %0 = xegpu.create_tdesc %arg0, %offsets : memref<128xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>

 // CHECK: %[[OLD:.*]] 
= arith.constant dense<0> : vector<64xi32>
 // CHECK: %[[LOAD_RES:.*]] = func.call @llvm.genx.raw.send2.v64i32.v16i1.v16i64({{.*}}, %[[MASK]], {{.*}}, %[[IN_PAYLOAD]], %[[OLD]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<64xi32>) -> vector<64xi32>
- %3 = xegpu.load %0, %mask : !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1> -> vector<16x8xf16>
+ %3 = xegpu.load %0, %mask : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1> -> vector<16x8xf16>

 // CHECK: %[[LOADA_v128f16:.*]] = vector.bitcast %[[LOAD_RES]] : vector<64xi32> to vector<128xf16>
 %66 = vector.shape_cast %3: vector<16x8xf16> to vector<128xf16>
@@ -58,12 +58,12 @@
 // CHECK: %[[OUT_ELEMENT_BYTEWIDTH:.*]] = arith.constant dense<4> : vector<16xindex>
 // CHECK: %[[OUT_ELEMENTWISE_OFFSET:.*]] = arith.muli %[[OUT_ELEMENT_BYTEWIDTH]], %[[OUT_OFFSET]] : vector<16xindex>
 // CHECK: %[[OUT_PAYLOAD:.*]] = arith.addi %[[OUT_PAYLOAD_BASEPTR_SHUFFLED]], %[[OUT_ELEMENTWISE_OFFSET]] : vector<16xindex>
- %2 = xegpu.create_tdesc %arg2, %offsets2 {chunk_size = 8} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr<scattered = true>>
+ %2 = xegpu.create_tdesc %arg2, %offsets2 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>

 %7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32>
 %8 = vector.shape_cast %7: vector<128xf32> to vector<16x8xf32>
 // CHECK: func.call @llvm.genx.raw.sends2.noresult.v16i1.v16i64.v128f32({{.*}}, %[[MASK]], {{.*}}, %[[OUT_PAYLOAD]], %[[C_ACC_v128f32]]) : (i8, i8, vector<16xi1>, i8, i8, i8, i32, i32, vector<16xindex>, vector<128xf32>) -> ()
- xegpu.store %8, %2, %mask : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+ xegpu.store %8, %2, %mask : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>

 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/reduction.mlir b/test/Conversion/XeTileToXeGPU/reduction.mlir
index 302e1b6df..6c50907ed 100644
--- a/test/Conversion/XeTileToXeGPU/reduction.mlir
+++ b/test/Conversion/XeTileToXeGPU/reduction.mlir
@@ -10,10 +10,10 @@ module {
 %c0 = arith.constant 0 : index
 %acc = arith.constant dense<0.0> : vector<16xf16>
 //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16>
- //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr>
+ //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr>
 %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16>
 //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr> -> vector<16x32xf16>
+ //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16>
 %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16>
 //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16>
@@ -179,9 +179,9 @@ module {
 %r = vector.multi_reduction , %e, %acc [1] : vector<16x32xf16> to vector<16xf16>
 //CHECK: %[[R161:.*]] = vector.shape_cast %[[R160]] : vector<16xf16> to vector<2x8xf16>
 %c = vector.shape_cast %r: vector<16xf16> to vector<2x8xf16>
- //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, 
#xegpu.tdesc_attr> + //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<2x8xf16> - //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.tdesc_attr> + //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> xetile.store_tile %c, %s : vector<2x8xf16>, !xetile.tile<2x8xf16> gpu.return } @@ -193,10 +193,10 @@ module { %c0 = arith.constant 0 : index %acc = arith.constant dense<0.0> : vector<32xf16> //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16> //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.tdesc_attr> -> vector<16x32xf16> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16> //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> //CHECK: %[[R3:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [1, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> @@ -318,9 +318,9 @@ module { %r = vector.multi_reduction , %e, %acc [0] : vector<16x32xf16> to vector<32xf16> //CHECK: %[[R118:.*]] = vector.shape_cast %[[R117]] : vector<32xf16> to vector<4x8xf16> %c = vector.shape_cast %r: vector<32xf16> to vector<4x8xf16> - //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.tdesc_attr> + //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<4x8xf16> - //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.tdesc_attr> + //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> xetile.store_tile %c, %s : vector<4x8xf16>, !xetile.tile<4x8xf16> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir index 3842da6ec..2040fd824 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir @@ -25,95 +25,95 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> 
-> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index //CHECK: %[[r7:.*]] = arith.addi %[[r1]], %[[c32]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c48:.*]] = arith.constant 48 : index //CHECK: %[[r9:.*]] = arith.addi %[[r1]], %[[c48]] : index - //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r11:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r16:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r19:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: 
%[[r18:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r19:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r21:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r24:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r25:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r24:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r25:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r26:.*]] = arith.addi %[[r0]], %[[c32]] : index - //CHECK: %[[r27:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r28:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r29:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r30:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r27:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r28:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r29:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r30:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c40:.*]] = arith.constant 40 : index //CHECK: %[[r31:.*]] = arith.addi %[[r0]], %[[c40]] : index - //CHECK: %[[r32:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r33:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r34:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r7]]] : memref<1024x1024xf32> -> 
!xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r35:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r32:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r33:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r34:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r35:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r36:.*]] = arith.addi %[[r0]], %[[c48]] : index - //CHECK: %[[r37:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r38:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r39:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r40:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r37:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r38:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r39:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r40:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c56:.*]] = arith.constant 56 : index //CHECK: %[[r41:.*]] = arith.addi %[[r0]], %[[c56]] : index - //CHECK: %[[r42:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r43:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r44:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r45:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r46:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r47:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r48:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r49:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r50:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r51:.*]] = 
xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r52:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r53:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r42:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r43:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r44:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r45:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r46:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r47:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r48:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r49:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r50:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r51:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r52:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r53:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xf32> -> !xetile.tile<64x64xf32> - //CHECK: %[[r54:.*]] = xegpu.load_nd %[[r46]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r55:.*]] = xegpu.load_nd %[[r47]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r56:.*]] = xegpu.load_nd %[[r48]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r57:.*]] = xegpu.load_nd %[[r49]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r58:.*]] = xegpu.load_nd %[[r50]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r59:.*]] = xegpu.load_nd %[[r51]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = 
#xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r60:.*]] = xegpu.load_nd %[[r52]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r61:.*]] = xegpu.load_nd %[[r53]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r54:.*]] = xegpu.load_nd %[[r46]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r55:.*]] = xegpu.load_nd %[[r47]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r56:.*]] = xegpu.load_nd %[[r48]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r57:.*]] = xegpu.load_nd %[[r49]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r58:.*]] = xegpu.load_nd %[[r50]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r59:.*]] = xegpu.load_nd %[[r51]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r60:.*]] = xegpu.load_nd %[[r52]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r61:.*]] = xegpu.load_nd %[[r53]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<64x64xf32> -> vector<64x64xf32> - //CHECK: %[[r62:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r63:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r64:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r65:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + //CHECK: %[[r62:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r63:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r64:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r65:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> 
%a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> - //CHECK: %[[r66:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r67:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r68:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r69:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + //CHECK: %[[r66:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r67:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r68:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r69:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> //CHECK: %[[r72:.*]]:16 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] //CHECK-SAME: iter_args(%[[arg4:.*]] = %[[r62]], %[[arg5:.*]] = %[[r63]], %[[arg6:.*]] = %[[r64]], %[[arg7:.*]] = %[[r65]], %[[arg8:.*]] = %[[r66]], //CHECK-SAME: %[[arg9:.*]] = %[[r67]], %[[arg10:.*]] = %[[r68]], %[[arg11:.*]] = %[[r69]], %[[arg12:.*]] = %[[r54]], %[[arg13:.*]] = %[[r55]], //CHECK-SAME: %[[arg14:.*]] = %[[r56]], %[[arg15:.*]] = %[[r57]], %[[arg16:.*]] = %[[r58]], %[[arg17:.*]] = %[[r59]], %[[arg18:.*]] = %[[r60]], - //CHECK-SAME: %[[arg19:.*]] = %[[r61]]) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + //CHECK-SAME: %[[arg19:.*]] = %[[r61]]) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, //CHECK-SAME: vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>) { %out:3 = scf.for %k = %c0 to %c1024 step %c64 iter_args(%a_tile = %a_init_tile, %b_tile = %b_init_tile, %c_value = %c_init_value) @@ -152,16 +152,16 @@ gpu.module @test_kernel { //CHECK: %[[r208:.*]] = vector.extract_strided_slice %[[arg19]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: %[[r105:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = 
#xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r105:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r106:.*]] = vector.extract %[[r105]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r107:.*]] = vector.extract %[[r105]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r108:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r108:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r109:.*]] = vector.extract %[[r108]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r110:.*]] = vector.extract %[[r108]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r111:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r111:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r112:.*]] = vector.extract %[[r111]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r113:.*]] = vector.extract %[[r111]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r114:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r114:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r115:.*]] = vector.extract %[[r114]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r116:.*]] = vector.extract %[[r114]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r117:.*]] = vector.extract_strided_slice %[[r106]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -198,16 +198,16 @@ gpu.module @test_kernel { //CHECK: %[[r148:.*]] = vector.extract_strided_slice %[[r116]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> %a_value = xetile.load_tile %a_tile : !xetile.tile<64x64xf16> -> vector<64x64xf16> - //CHECK: %[[r149:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + //CHECK: %[[r149:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> //CHECK: %[[r150:.*]] = vector.extract %[[r149]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r151:.*]] = vector.extract %[[r149]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> - //CHECK: %[[r152:.*]] = 
xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + //CHECK: %[[r152:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> //CHECK: %[[r153:.*]] = vector.extract %[[r152]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r154:.*]] = vector.extract %[[r152]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> - //CHECK: %[[r155:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + //CHECK: %[[r155:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> //CHECK: %[[r156:.*]] = vector.extract %[[r155]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r157:.*]] = vector.extract %[[r155]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> - //CHECK: %[[r158:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + //CHECK: %[[r158:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> //CHECK: %[[r159:.*]] = vector.extract %[[r158]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r160:.*]] = vector.extract %[[r158]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[r161:.*]] = vector.extract_strided_slice %[[r150]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xf16> to vector<8x16x2xf16> @@ -384,21 +384,21 @@ gpu.module @test_kernel { //CHECK: %[[r358:.*]] = vector.shuffle %[[r288]], %[[r304]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> //CHECK: %[[r359:.*]] = vector.shuffle %[[r320]], %[[r336]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> //CHECK: %[[r360:.*]] = vector.shuffle %[[r358]], %[[r359]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16x16xf32>, vector<16x16xf32> - //CHECK: %[[r361:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r362:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r363:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r364:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r365:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r366:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r367:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c64]], %[[c0]]] : 
!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - //CHECK: %[[r368:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + //CHECK: %[[r361:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r362:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r363:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r364:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r365:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r366:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r367:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r368:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %a_next_tile = xetile.update_tile_offset %a_tile, [%c0, %c64] : !xetile.tile<64x64xf16>, index, index -> !xetile.tile<64x64xf16> %b_next_tile = xetile.update_tile_offset %b_tile, [%c64, %c0] : !xetile.tile<64x64xf16>, index, index -> !xetile.tile<64x64xf16> //CHECK: scf.yield %[[r361]], %[[r362]], %[[r363]], %[[r364]], %[[r365]], %[[r366]], %[[r367]], %[[r368]], %[[r339]], %[[r345]], %[[r351]], %[[r357]], %[[r342]], %[[r348]], %[[r354]], %[[r360]] - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>, //CHECK-SAME: vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32> scf.yield %a_next_tile, %b_next_tile, %c_new_value : !xetile.tile<64x64xf16>, !xetile.tile<64x64xf16>, vector<64x64xf32> @@ -435,38 +435,38 @@ gpu.module @test_kernel { //CHECK: %[[r102:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r103:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r104:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: xegpu.store_nd 
%[[r73]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r77]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r81]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r85]], %[[r10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r74]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r78]], %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r82]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r86]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r75]], %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r79]], %[[r18]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r83]], %[[r19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r87]], %[[r20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r76]], %[[r22]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r80]], %[[r23]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r84]], %[[r24]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r88]], %[[r25]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r89]], %[[r27]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r93]], %[[r28]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = 
#xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r97]], %[[r29]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r101]], %[[r30]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r90]], %[[r32]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r94]], %[[r33]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r98]], %[[r34]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r102]], %[[r35]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r91]], %[[r37]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r95]], %[[r38]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r99]], %[[r39]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r103]], %[[r40]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r92]], %[[r42]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r96]], %[[r43]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r100]], %[[r44]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r104]], %[[r45]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: xegpu.store_nd %[[r73]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r77]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r81]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, 
#xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r85]], %[[r10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r74]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r78]], %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r82]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r86]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r75]], %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r79]], %[[r18]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r83]], %[[r19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r87]], %[[r20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r76]], %[[r22]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r80]], %[[r23]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r84]], %[[r24]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r88]], %[[r25]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r89]], %[[r27]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r93]], %[[r28]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r97]], %[[r29]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r101]], %[[r30]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, 
#xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r90]], %[[r32]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r94]], %[[r33]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r98]], %[[r34]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r102]], %[[r35]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r91]], %[[r37]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r95]], %[[r38]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r99]], %[[r39]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r103]], %[[r40]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r92]], %[[r42]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r96]], %[[r43]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r100]], %[[r44]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r104]], %[[r45]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %out#2, %c_init_tile: vector<64x64xf32>, !xetile.tile<64x64xf32> gpu.return diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir index 4dc97b4cc..2c7400432 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir @@ -24,44 +24,44 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc 
%[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[r10:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.tdesc_attr> - //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.tdesc_attr> -> vector<32x16xi32> - //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.tdesc_attr> -> vector<32x16xi32> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> + //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xi32> -> !xetile.tile<32x32xi32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<32x32xi32> -> vector<32x32xi32> - //CHECK: %20 = xegpu.create_nd_tdesc %[[arg0]][%2, %[[c0]]] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x32xi8, #xegpu.tdesc_attr> + //CHECK: %20 = xegpu.create_nd_tdesc %[[arg0]][%2, %[[c0]]] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xi8> -> !xetile.tile<32x32xi8> - //CHECK: %21 = xegpu.create_nd_tdesc %[[arg1]][%c0, %3] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x16xi8, #xegpu.tdesc_attr> + //CHECK: %21 = xegpu.create_nd_tdesc %[[arg1]][%c0, %3] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xi8> -> !xetile.tile<32x32xi8> %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%a_tile = %a_init_tile, %b_tile = %b_init_tile, %c_value = %c_init_value) -> (!xetile.tile<32x32xi8>, !xetile.tile<32x32xi8>, vector<32x32xi32>) { - //CHECK: %[[r39:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xi8, #xegpu.tdesc_attr> -> vector<32x32xi8> + //CHECK: %[[r39:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> -> vector<32x32xi8> //CHECK: %[[r40:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r41:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [8, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r42:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [16, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r43:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [24, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> %a_value = xetile.load_tile %a_tile : !xetile.tile<32x32xi8> -> vector<32x32xi8> - //CHECK: %[[r44:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xi8, #xegpu.tdesc_attr> -> vector<2x8x16x4xi8> + //CHECK: %[[r44:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> -> vector<2x8x16x4xi8> //CHECK: %[[r45:.*]] = vector.extract %[[r44]][0] : vector<8x16x4xi8> from vector<2x8x16x4xi8> //CHECK: %[[r46:.*]] = vector.extract %[[r44]][1] : vector<8x16x4xi8> from vector<2x8x16x4xi8> %b_value = xetile.load_tile %b_tile : !xetile.tile<32x32xi8> -> vector<32x32xi8> @@ -69,14 +69,14 @@ gpu.module @test_kernel { //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x32xi8>, vector<8x16x4xi8>, vector<8x16xi32> -> vector<8x16xi32> %c_new_value = xetile.tile_mma %a_value, %b_value, %c_value : vector<32x32xi8>, vector<32x32xi8>, vector<32x32xi32> -> vector<32x32xi32> - //CHECK: 
xegpu.update_nd_offset %{{.*}}, [%[[c0]], %[[c32]]] : !xegpu.tensor_desc<32x32xi8, #xegpu.tdesc_attr> - //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c32]], %[[c0]]] : !xegpu.tensor_desc<32x16xi8, #xegpu.tdesc_attr> + //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c0]], %[[c32]]] : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> + //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c32]], %[[c0]]] : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> %a_next_tile = xetile.update_tile_offset %a_tile, [%c0, %c32] : !xetile.tile<32x32xi8>, index, index -> !xetile.tile<32x32xi8> %b_next_tile = xetile.update_tile_offset %b_tile, [%c32, %c0] : !xetile.tile<32x32xi8>, index, index -> !xetile.tile<32x32xi8> scf.yield %a_next_tile, %b_next_tile, %c_new_value : !xetile.tile<32x32xi8>, !xetile.tile<32x32xi8>, vector<32x32xi32> } - //CHECK-COUNT-8: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32, #xegpu.tdesc_attr> + //CHECK-COUNT-8: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> xetile.store_tile %out#2, %c_init_tile {innner_blocks = [8, 16]}: vector<32x32xi32>, !xetile.tile<32x32xi32> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir index 2e3e80358..8108c360c 100755 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir @@ -23,43 +23,43 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r10:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], 
%[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> %2 = xetile.init_tile %arg2[%0, %1] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> - //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> %3 = xetile.load_tile %2 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf32> -> vector<32x32xf32> - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> - //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c16]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c16]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<1024x1024xtf32> -> !xetile.tile<32x32xtf32> - //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc 
%[[arg1]][%[[c0]], %[[r3]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> - //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r5]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> + //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r5]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> %5 = xetile.init_tile %arg1[%c0, %1] : memref<1024x1024xtf32> -> !xetile.tile<32x32xtf32> //CHECK: %[[r24:.*]]:6 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] //CHECK-SAME: iter_args(%[[arg4:.*]] = %[[r20]], %[[arg5:.*]] = %[[r21]], %[[arg6:.*]] = %[[r22]], %[[arg7:.*]] = %[[r23]], %[[arg8:.*]] = %[[r18]], %[[arg9:.*]] = %[[r19]]) - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c64 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xetile.tile<32x32xtf32>, !xetile.tile<32x32xtf32>, vector<32x32xf32>) { //CHECK: %[[r65:.*]] = vector.extract_strided_slice %[[arg8]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r66:.*]] = vector.extract_strided_slice %[[arg8]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -70,10 +70,10 @@ gpu.module @test_kernel { //CHECK: %[[r71:.*]] = vector.extract_strided_slice %[[arg9]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r72:.*]] = vector.extract_strided_slice %[[arg9]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: %[[r33:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> -> vector<2x32x8xtf32> + //CHECK: %[[r33:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> //CHECK: %[[r34:.*]] = vector.extract %[[r33]][0] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r35:.*]] = vector.extract %[[r33]][1] : vector<32x8xtf32> from vector<2x32x8xtf32> - //CHECK: %[[r36:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> -> vector<2x32x8xtf32> + //CHECK: %[[r36:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> //CHECK: %[[r37:.*]] = vector.extract %[[r36]][0] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: 
%[[r38:.*]] = vector.extract %[[r36]][1] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r39:.*]] = vector.extract_strided_slice %[[r34]] {offsets = [0, 0], sizes = [8, 8], strides = [1, 1]} : vector<32x8xtf32> to vector<8x8xtf32> @@ -94,8 +94,8 @@ gpu.module @test_kernel { //CHECK: %[[r54:.*]] = vector.extract_strided_slice %[[r38]] {offsets = [24, 0], sizes = [8, 8], strides = [1, 1]} : vector<32x8xtf32> to vector<8x8xtf32> %7 = xetile.load_tile %arg4 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xtf32> -> vector<32x32xtf32> - //CHECK: %[[r55:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> -> vector<32x16xtf32> - //CHECK: %[[r56:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> -> vector<32x16xtf32> + //CHECK: %[[r55:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> + //CHECK: %[[r56:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> //CHECK: %[[r57:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> //CHECK: %[[r58:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> //CHECK: %[[r59:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> @@ -149,19 +149,19 @@ gpu.module @test_kernel { %9 = xetile.tile_mma %7, %8, %arg6 : vector<32x32xtf32>, vector<32x32xtf32>, vector<32x32xf32> -> vector<32x32xf32> - //CHECK: %[[r111:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> - //CHECK: %[[r112:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr> + //CHECK: %[[r111:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r112:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> %10 = xetile.update_tile_offset %arg4, [%c0, %c64] : !xetile.tile<32x32xtf32>, index, index -> !xetile.tile<32x32xtf32> - //CHECK: %[[r113:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> - //CHECK: %[[r114:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr> + //CHECK: %[[r113:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r114:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> %11 = xetile.update_tile_offset %arg5, [%c64, %c0] : !xetile.tile<32x32xtf32>, index, index -> !xetile.tile<32x32xtf32> //CHECK: scf.yield %[[r111]], %[[r112]], %[[r113]], %[[r114]], %[[r107]], %[[r110]] - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: 
!xegpu.tensor_desc<32x8xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> scf.yield %10, %11, %9 : !xetile.tile<32x32xtf32>, !xetile.tile<32x32xtf32>, vector<32x32xf32> } @@ -173,14 +173,14 @@ gpu.module @test_kernel { //CHECK: %[[r30:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r31:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r32:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: xegpu.store_nd %[[r25]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r29]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r26]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r30]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r27]], %[[r11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r31]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r28]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %[[r32]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + //CHECK: xegpu.store_nd %[[r25]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r29]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r26]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r30]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r27]], %[[r11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r31]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r28]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r32]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %6#2, %2 : vector<32x32xf32>, !xetile.tile<32x32xf32> //CHECK: gpu.return gpu.return diff --git a/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir index c3f153db1..2fb7cc259 100644 --- a/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir @@ -8,10 +8,10 @@ gpu.module @test_kernel { //CHECK: %[[c64:.*]] = arith.constant 64 : index %c64 = arith.constant 64 : index //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] - //CHECK-SAME: memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> + //CHECK-SAME: memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - //CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16> + //CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %2 = xetile.load_tile %1 : !xetile.tile<32x32xf16> -> vector<32x32xf16> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir b/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir index b5aed2a49..d9640d474 100755 --- a/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir @@ -56,7 +56,7 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp %26 = arith.muli %arg3, %c1024 : index %27 = arith.addi %26, %13 : index %28 = arith.addi %27, %16 : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc {{.*}} : memref<2048x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc {{.*}} : memref<2048x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %29 = xetile.init_tile %arg1[%28, %15] : memref<2048x12288xbf16> -> !xetile.tile<32x32xbf16> %30 = scf.for %arg4 = %c0 to %c2 step %c1 iter_args(%arg5 = %cst) -> (vector<1x4xf32>) { @@ -65,23 +65,23 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp %35 = arith.addi %34, %10 : index %36 = arith.addi %35, %11 : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %37 = xetile.init_tile %arg0[%36, %9] : memref<16384x12288xbf16> -> 
!xetile.tile<32x32xbf16> %38:3 = scf.for %arg6 = %c0 to %c12288 step %c32 iter_args(%arg7 = %37, %arg8 = %29, %arg9 = %cst_0) -> (!xetile.tile<32x32xbf16>, !xetile.tile<32x32xbf16>, vector<32x32xf32>) { - //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x32x16xbf16> + //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK-COUNT-2: %{{.*}} = vector.extract %{{.*}} : vector<32x16xbf16> from vector<2x32x16xbf16> //CHECK-COUNT-8: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %48 = xetile.load_tile %arg7 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xbf16> -> vector<32x32xbf16> - //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xbf16> + //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> //CHECK-COUNT-2: %{{.*}} = vector.extract %{{.*}} : vector<16x16x2xbf16> from vector<2x16x16x2xbf16> //CHECK-COUNT-4: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> %49 = xetile.load_tile %arg8 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xbf16> -> vector<32x32xbf16> - //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> - //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %50 = xetile.update_tile_offset %arg7, [%c0, %c32] : !xetile.tile<32x32xbf16>, index, index -> !xetile.tile<32x32xbf16> %51 = xetile.update_tile_offset %arg8, [%c0, %c32] : !xetile.tile<32x32xbf16>, index, index -> !xetile.tile<32x32xbf16> @@ -101,17 +101,17 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp %41 = vector.shape_cast %40 : vector<32xf32> to vector<1x32xf32> %alloc = memref.alloc() : memref<8x128xf32, 3> - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x16xf32, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x16xf32, #xegpu.block_tdesc_attr> //CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x16xf32, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x16xf32, #xegpu.block_tdesc_attr> %42 = xetile.init_tile %alloc[%17, %13] : memref<8x128xf32, 3> -> !xetile.tile<1x32xf32, #xetile.tile_attr> //CHECK-COUNT-2: vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32> - //CHECK-COUNT-2: xegpu.store_nd 
%{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32, #xegpu.tdesc_attr> + //CHECK-COUNT-2: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %41, %42 : vector<1x32xf32>, !xetile.tile<1x32xf32, #xetile.tile_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.load_nd {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr> -> vector<8x4xf32> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.load_nd {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> //CHECK-COUNT-8: vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32> //CHECK-COUNT-8: arith.addf %{{.*}}, %{{.*}} : vector<1x4xf32> %43 = xetile.init_tile %alloc[%21, %23] : memref<8x128xf32, 3> -> !xetile.tile<8x4xf32, #xetile.tile_attr> @@ -123,8 +123,8 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp } {lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array, step = 1 : index, syn.mm_dim = 0 : i64, syn.parall_level = 2 : i64, upperBoundMap = affine_map<() -> (2)>} //CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<32x2048xf32> -> !xegpu.tensor_desc<1x4xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x4xf32>, !xegpu.tensor_desc<1x4xf32, #xegpu.tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<32x2048xf32> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x4xf32>, !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> %31 = arith.addi %26, %16 : index %32 = xetile.init_tile %arg2[%25, %31] : memref<32x2048xf32> -> !xetile.tile<1x4xf32> xetile.store_tile %30, %32 : vector<1x4xf32>, !xetile.tile<1x4xf32> diff --git a/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir index 7f065147d..a452e0454 100644 --- a/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir @@ -12,19 +12,19 @@ gpu.module @test_kernel { %c64 = arith.constant 64 : index %c1024 = arith.constant 1024 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> - //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, 
- //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, vector<32x32xf16>) {
+ //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>) {
 %nexta, %res = scf.for %k= %c0 to %c1024 step %c64 iter_args(%subA = %1, %subB = %cst) -> (!xetile.tile<32x32xf16>, vector<32x32xf16>) {
- //CHECK: %[[r10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %3 = xetile.load_tile %subA : !xetile.tile<32x32xf16> -> vector<32x32xf16>
- //CHECK: %[[r11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %5 = xetile.update_tile_offset %subA, [%c0, %c64]: !xetile.tile<32x32xf16>, index, index -> !xetile.tile<32x32xf16>
- //CHECK: scf.yield %[[r11]], %[[r10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, vector<32x32xf16>
+ //CHECK: scf.yield %[[r11]], %[[r10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>
 scf.yield %5, %3: !xetile.tile<32x32xf16>, vector<32x32xf16>
 }
 //CHECK: %[[r2:.*]] = vector.extract_strided_slice %[[r1]]#1 {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
@@ -33,19 +33,19 @@ gpu.module @test_kernel {
 //CHECK: %[[r5:.*]] = vector.extract_strided_slice %[[r1]]#1 {offsets = [24, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
- //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c8:.*]] = arith.constant 8 : index
- //CHECK: %[[r7:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r7:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c16:.*]] = arith.constant 16 : index
- //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c24:.*]] = arith.constant 24 : index
- //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 %5 = xetile.init_tile %b[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16>
- //CHECK: xegpu.store_nd %[[r2]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[r3]], %[[r7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[r4]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[r5]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ //CHECK: xegpu.store_nd %[[r2]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[r3]], %[[r7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[r4]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[r5]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 xetile.store_tile %res, %5: vector<32x32xf16>, !xetile.tile<32x32xf16>
 //CHECK: gpu.return
diff --git a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir
index 65899b751..58457381c 100644
--- a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir
@@ -4,13 +4,13 @@ gpu.module @test_kernel {
 //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>)
 gpu.func @sglevel_softmax_dim_0(%a: memref<1024x1024xf16>) {
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %1 = xetile.init_tile %a[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16>
- //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
- //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
@@ -31,12 +31,12 @@ gpu.module @test_kernel {
 //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>)
 gpu.func @sglevel_softmax_dim_1(%a: memref<1024x1024xf16>) {
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %1 = xetile.init_tile %a[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16>
- //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
- //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
diff --git a/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir
index b3ea44e70..07d7111f4 100644
--- a/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir
@@ -12,28 +12,28 @@ gpu.module @test_kernel {
 %result = arith.constant dense<0.0>: vector<32x32xf32>
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 //CHECK: %[[c48:.*]] = arith.constant 48 : index
- //CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 //CHECK: %[[c8:.*]] = arith.constant 8 : index
- //CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 //CHECK: %[[c16:.*]] = arith.constant 16 : index
- //CHECK: %[[R8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: %[[R9:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: %[[R9:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 //CHECK: %[[c24:.*]] = arith.constant 24 : index
- //CHECK: %[[R10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: %[[R11:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: %[[R10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: %[[R11:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %1 = xetile.init_tile %a[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32>
- //CHECK: xegpu.store_nd %[[R0]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R0]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R1]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R1]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R2]], %[[R8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R2]], %[[R9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R3]], %[[R10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- //CHECK: xegpu.store_nd %[[R3]], %[[R11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R0]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R0]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R1]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R1]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R2]], %[[R8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R2]], %[[R9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R3]], %[[R10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ //CHECK: xegpu.store_nd %[[R3]], %[[R11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 xetile.store_tile %result, %1: vector<32x32xf32>, !xetile.tile<32x32xf32>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir
index d92ee037e..c99ec99bd 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir
@@ -9,10 +9,10 @@ gpu.module @test_kernel {
 //CHECK: %[[c64:.*]] = arith.constant 64 : index
 %c64 = arith.constant 64 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16>
- //CHECK: %[[r1:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16>
+ //CHECK: %[[r1:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16>
 //CHECK: %[[r2:.*]] = vector.extract %[[r1]][0] : vector<32x16xf16> from vector<2x32x16xf16>
 //CHECK: %[[r3:.*]] = vector.extract %[[r1]][1] : vector<32x16xf16> from vector<2x32x16xf16>
 //CHECK: %[[r4:.*]] = vector.extract_strided_slice %[[r2]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16>
@@ -24,15 +24,15 @@ gpu.module @test_kernel {
 //CHECK: %[[r10:.*]] = vector.extract_strided_slice %[[r3]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16>
 //CHECK: %[[r11:.*]] = vector.extract_strided_slice %[[r3]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16>
 %2 = xetile.load_tile %1 : !xetile.tile<32x32xf16> -> vector<32x32xf16>
- //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c64]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c64]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %arg1[%[[c64]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %arg1[%[[c64]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 %3 = xetile.init_tile %b[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16>
- //CHECK: %[[r14:.*]] = xegpu.load_nd %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16>
+ //CHECK: %[[r14:.*]] = xegpu.load_nd %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16>
 //CHECK: %[[r15:.*]] = vector.extract %[[r14]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 //CHECK: %[[r16:.*]] = vector.extract %[[r14]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
- //CHECK: %[[r17:.*]] = xegpu.load_nd %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16>
+ //CHECK: %[[r17:.*]] = xegpu.load_nd %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16>
 //CHECK: %[[r18:.*]] = vector.extract %[[r17]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 //CHECK: %[[r19:.*]] = vector.extract %[[r17]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 //CHECK: %[[r20:.*]] = vector.extract_strided_slice %[[r15]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xf16> to vector<8x16x2xf16>
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir
index 5f336d335..ea32986ef 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir
@@ -10,7 +10,7 @@ gpu.module @test_kernel {
 %3 = xetile.tile_unpack %2 { inner_blocks = [1, 16] } : vector<32x4x1x16xf16> -> vector<32x64xf16>
 %4 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
 %5 = xetile.tile_pack %3 { inner_blocks = [1, 16] } : vector<32x64xf16> -> vector<32x4x1x16xf16>
- // CHECK-COUNT-128: xegpu.store_nd %[[cst]], %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.tdesc_attr>
+ // CHECK-COUNT-128: xegpu.store_nd %[[cst]], %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>
 xetile.store_tile %5, %4 : vector<32x4x1x16xf16>, !xetile.tile<32x64xf16, #xetile.tile_attr>
 gpu.return
 }
@@ -87,7 +87,7 @@ gpu.module @test_kernel {
 %2 = xetile.tile_unpack %1 { inner_blocks = [1, 16] } : vector<32x4x1x16xf16> -> vector<32x64xf16>
 %3 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
 %4 = xetile.tile_pack %2 { inner_blocks = [1, 16] } : vector<32x64xf16> -> vector<32x4x1x16xf16>
- // CHECK-COUNT-128: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.tdesc_attr>
+ // CHECK-COUNT-128: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>
 xetile.store_tile %4, %3 : vector<32x4x1x16xf16>, !xetile.tile<32x64xf16, #xetile.tile_attr>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir
index 04b2db65f..c7a56882c 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir
@@ -9,11 +9,11 @@ gpu.module @test_kernel {
 %c64 = arith.constant 64 : index
 // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[%[[C0]], %[[C64]]] : memref<1024x1024xf16>
- // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
 // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %1 = xetile.load_tile %0 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x1x32x32xf16>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
index b9b7b49c4..0f15fa7ff 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
@@ -14,30 +14,30 @@
 // CHECK: %[[c1024:.*]] = arith.constant 1024 : index
 %c1024 = arith.constant 1024 : index
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
 // CHECK: %[[R1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]]
- // CHECK-SAME: iter_args(%[[arg3:.*]] = %[[R0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, vector<32x32xf16>)
+ // CHECK-SAME: iter_args(%[[arg3:.*]] = %[[R0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>)
 %1:2 = scf.for %arg2 = %c0 to %c1024 step %c64 iter_args(%arg3 = %0, %arg4 = %cst) -> (!xetile.tile<32x32xf16, #xetile.tile_attr>, vector<1x1x32x32xf16>) {
- // CHECK: %[[R10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ // CHECK: %[[R10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %5 = xetile.load_tile %arg3 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x1x32x32xf16>
- // CHECK: %[[R11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %6 = xetile.update_tile_offset %arg3, [%c0, %c64] : !xetile.tile<32x32xf16, #xetile.tile_attr>, index, index -> !xetile.tile<32x32xf16, #xetile.tile_attr>
- // CHECK: scf.yield %[[R11]], %[[R10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>, vector<32x32xf16>
+ // CHECK: scf.yield %[[R11]], %[[R10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>
 scf.yield %6, %5 : !xetile.tile<32x32xf16, #xetile.tile_attr>, vector<1x1x32x32xf16>
 }
- // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 // CHECK: %[[c8:.*]] = arith.constant 8 : index
- // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 // CHECK: %[[c16:.*]] = arith.constant 16 : index
- // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 // CHECK: %[[c24:.*]] = arith.constant 24 : index
- // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 %2 = xetile.init_tile %arg1[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
 // CHECK: %[[R6:.*]] = vector.extract_strided_slice %[[R1]]#1 {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16>
@@ -47,10 +47,10 @@
 %3 = xetile.tile_unpack %1#1 { inner_blocks = [32, 32] } : vector<1x1x32x32xf16> -> vector<32x32xf16>
 %4 = xetile.tile_pack %3 { inner_blocks = [8, 32] } : vector<32x32xf16> -> vector<4x1x8x32xf16>
- // CHECK: xegpu.store_nd %[[R6]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- // CHECK: xegpu.store_nd %[[R7]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- // CHECK: xegpu.store_nd %[[R8]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
- // CHECK: xegpu.store_nd %[[R9]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.tdesc_attr>
+ // CHECK: xegpu.store_nd %[[R6]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: xegpu.store_nd %[[R7]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: xegpu.store_nd %[[R8]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: xegpu.store_nd %[[R9]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr>
 xetile.store_tile %4, %2 : vector<4x1x8x32xf16>, !xetile.tile<32x32xf16, #xetile.tile_attr>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir
index da41639bd..cd736f738 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir
@@ -4,13 +4,13 @@ gpu.module @test_kernel {
 //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>)
 gpu.func @sglevel_softmax_dim_0(%arg0: memref<1024x1024xf16>) {
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
- //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
- //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %1 = xetile.load_tile %0 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x2x32x32xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
@@ -35,12 +35,12 @@ gpu.module @test_kernel {
 //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>)
 gpu.func @sglevel_softmax_dim_1(%arg0: memref<1024x1024xf16>) {
 //CHECK: %[[c0:.*]] = arith.constant 0 : index
- //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 //CHECK: %[[c32:.*]] = arith.constant 32 : index
- //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr>
+ //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
- //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
- //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
 %1 = xetile.load_tile %0 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x2x32x32xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
 //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16>
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir
index 1ca2a5549..c6829fbfe 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir
@@ -8,25 +8,25 @@
 // CHECK: %[[c0:.*]] = arith.constant 0 : index
 // CHECK: %[[c32:.*]] = arith.constant 32 : index
 // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[c48:.*]] = arith.constant 48 : index
 // CHECK: %[[R1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[c8:.*]] = arith.constant 8 : index
 // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[c16:.*]] = arith.constant 16 : index
 // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[c24:.*]] = arith.constant 24 : index
 // CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32>
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32, #xetile.tile_attr>
 // CHECK: %[[R8:.*]] = vector.extract_strided_slice %[[cst]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32>
@@ -37,21 +37,21 @@
 %2 = xetile.tile_pack %1 { inner_blocks = [8, 16] } : vector<32x32xf32> -> vector<4x2x8x16xf32>
 // CHECK: xegpu.store_nd %[[R8]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R8]], %[[R1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R9]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R9]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R10]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R10]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R11]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.store_nd %[[R11]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 xetile.store_tile %2, %0 : vector<4x2x8x16xf32>, !xetile.tile<32x32xf32, #xetile.tile_attr>
 gpu.return
 }
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir
index de76a2a62..a899e8773 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir
@@ -8,24 +8,24 @@ gpu.module @test_kernel {
 // CHECK: %[[C64:.*]] = arith.constant 64 : index
 %c64 = arith.constant 64 : index
- // CHECK: %[[REG0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[C0]], %[[C64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[REG0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[C0]], %[[C64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 %0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
- // CHECK: %[[REG1:.*]] = xegpu.load_nd %[[REG0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16>
+ // CHECK: %[[REG1:.*]] = xegpu.load_nd %[[REG0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16>
 // CHECK: %[[REG2:.*]] = vector.extract %[[REG1]][0] : vector<32x16xf16> from vector<2x32x16xf16>
 // CHECK: %[[REG3:.*]] = vector.extract %[[REG1]][1] : vector<32x16xf16> from vector<2x32x16xf16>
 %1 = xetile.load_tile %0 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x2x32x16xf16>
- // CHECK: %[[REG4:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[REG4:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 // CHECK: %[[C32:.*]] = arith.constant 32 : index
- // CHECK: %[[REG5:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>
+ // CHECK: %[[REG5:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>
 %2 = xetile.init_tile %arg1[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr>
- // CHECK: %[[REG6:.*]] = xegpu.load_nd %[[REG4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16>
+ // CHECK: %[[REG6:.*]] = xegpu.load_nd %[[REG4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16>
 // CHECK: %[[REG7:.*]] = vector.extract %[[REG6]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 // CHECK: %[[REG8:.*]] = vector.extract %[[REG6]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
- // CHECK: %[[REG9:.*]] = xegpu.load_nd %[[REG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16>
+ // CHECK: %[[REG9:.*]] = xegpu.load_nd %[[REG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16>
 // CHECK: %[[REG10:.*]] = vector.extract %[[REG9]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 // CHECK: %[[REG11:.*]] = vector.extract %[[REG9]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16>
 %3 = xetile.load_tile %2 { padding = 0.000000e+00 : f32 } : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x4x32x16xf16>
diff --git a/test/Conversion/XeTileToXeGPU/test_order.mlir b/test/Conversion/XeTileToXeGPU/test_order.mlir
index e5befc0a7..68a1b43ab 100644
--- a/test/Conversion/XeTileToXeGPU/test_order.mlir
+++ b/test/Conversion/XeTileToXeGPU/test_order.mlir
@@ -5,14 +5,14 @@
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[C16:.*]] = arith.constant 16 : index
 // CHECK: %[[R_CAST:.*]] = memref.reinterpret_cast %[[ARG1]] to offset: [0], sizes: [128, 64], strides: [64, 1] : memref<64x128xf16, strided<[1, 64]>> to memref<128x64xf16>
-// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C0]], %[[C0]]] : memref<128x64xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr>
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C16]], %[[C0]]] : memref<128x64xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr>
-// CHECK: %[[T8:.*]] = xegpu.load_nd %[[T1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<8x16x2xf16>
-// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<8x16x2xf16>
-// CHECK: %[[T19:.*]] = xegpu.update_nd_offset %[[T1]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr>
-// CHECK: %[[T20:.*]] = xegpu.update_nd_offset %[[T2]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr>
-// CHECK: %[[T26:.*]] = xegpu.load_nd %[[T19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<8x16x2xf16>
-// CHECK: %[[T27:.*]] = xegpu.load_nd %[[T20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<8x16x2xf16>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C0]], %[[C0]]] : memref<128x64xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C16]], %[[C0]]] : memref<128x64xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>
+// CHECK: %[[T8:.*]] = xegpu.load_nd %[[T1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16x2xf16>
+// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16x2xf16>
+// CHECK: %[[T19:.*]] = xegpu.update_nd_offset %[[T1]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>
+// CHECK: %[[T20:.*]] = xegpu.update_nd_offset %[[T2]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>
+// CHECK: %[[T26:.*]] = xegpu.load_nd %[[T19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16x2xf16>
+// CHECK: %[[T27:.*]] = xegpu.load_nd %[[T20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, transpose = array, transpose_bit_width = 32 : i32}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16x2xf16>
 gpu.module @test_kernel {
 func.func @test_func(%A : memref<128x64xf16>, %B : memref<64x128xf16, strided<[1, 64], offset: 0>>) {
 %c0 = arith.constant 0 : index
diff --git a/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/test/Dialect/XeGPU/IR/XeGPUOps.mlir
index 19954e667..09ea28a35 100644
--- a/test/Dialect/XeGPU/IR/XeGPUOps.mlir
+++ b/test/Dialect/XeGPU/IR/XeGPUOps.mlir
@@ -24,11 +24,10 @@ func.func @test_create_nd_tdesc_vc(%src: memref<24x32xf32>) {
 // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
 func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
- // CHECK: xegpu.create_tdesc
- // CHECK-SAME: {chunk_size = 2 : i64}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
+ // CHECK: xegpu.create_tdesc %{{.*}} : ui64, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
+ -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir b/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
index d763e002b..c4bf4aea0 100644
--- a/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
+++ b/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
@@ -6,33 +6,33 @@
 // CHECK-LABEL: func @test_atomic_rmw({{.*}}) {
 func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : vector<16xf32>, %mask : vector<16xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16xf32>
- xegpu.atomic_rmw addf %1, %mask, %value : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32>
+ xegpu.atomic_rmw addf %1, %mask, %value : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
 return
 }
 // CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) {
 func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16x2xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
 // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>, vector<16x2xi1>, vector<16x2xf32>
- xegpu.atomic_rmw mulf %1, %mask, %value : !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>, vector<16x2xi1>, vector<16x2xf32> -> vector<16x2xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16x2xi1>, vector<16x2xf32>
+ xegpu.atomic_rmw mulf %1, %mask, %value : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16x2xi1>, vector<16x2xf32> -> vector<16x2xf32>
 return
 }
 // CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) {
 func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16x2xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>
 // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>, vector<16x2xi1>, vector<16x2xi32>
- xegpu.atomic_rmw andi %1, %mask, %value : !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>, vector<16x2xi1>, vector<16x2xi32> -> vector<16x2xi32>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16x2xi1>, vector<16x2xi32>
+ xegpu.atomic_rmw andi %1, %mask, %value : !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16x2xi1>, vector<16x2xi32> -> vector<16x2xi32>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
index 23e92b46e..e437622a6 100644
--- a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
+++ b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
@@ -65,9 +65,9 @@ func.func @test_create_nd_tdesc_4(%src: memref, %w : index, %h : index,
 func.func @test_create_nd_tdesc_5(%src: memref, %w : index, %h : index, %x : index, %y : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>>
 %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 return
 }
@@ -75,9 +75,9 @@ func.func @test_create_nd_tdesc_5(%src: memref, %w : index, %h : index,
 func.func @test_create_nd_tdesc_6(%src: memref, %w : index, %h : index, %x : index, %y : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>>
 %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 return
 }
@@ -94,9 +94,9 @@ func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) {
 func.func @test_create_nd_tdesc_8(%src: memref, %w : index, %h : index, %x : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>>
 %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 return
 }
@@ -104,8 +104,8 @@ func.func @test_create_nd_tdesc_8(%src: memref, %w : index, %h : index,
 func.func @test_create_nd_tdesc_9(%src: memref, %w : index, %h : index, %x : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<64x128xf16, #xegpu.tdesc_attr>>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr>>
 %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] : memref
- -> !xegpu.tensor_desc<64x128xf16, #xegpu.tdesc_attr>
+ -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
index 30a2ee66a..e24c15574 100644
--- a/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
+++ b/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
@@ -69,9 +69,9 @@ func.func @test_create_nd_tdesc_vc_5(%src: memref, %w : index, %h : ind
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
 // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 return
 }
@@ -80,9 +80,9 @@ func.func @test_create_nd_tdesc_vc_6(%src: memref, %w : index, %h : ind
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
 // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 return
 }
@@ -100,16 +100,16 @@ func.func @test_create_nd_tdesc_vc_7(%src: memref<1024xf32>, %offset : index) {
 func.func @test_create_nd_tdesc_vc_8(%src: memref, %w : index, %h : index, %x : index) {
 %c1 = arith.constant 1 : index
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1]
- : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 return
 }
 // CHECK-LABEL: func @test_create_nd_tdesc_vc_9({{.*}}) {
 func.func @test_create_nd_tdesc_vc_9(%src: memref<8x32xf32>) {
 // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
index 245a37ba9..6f5b1a743 100644
--- a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
+++ b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
@@ -8,37 +8,35 @@
 // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
 func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
 // CHECK: xegpu.create_tdesc %arg0, %arg1
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 return
 }
 // CHECK-LABEL: func @test_create_tdesc_vc_2({{.*}}) {
 func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) {
 // CHECK: xegpu.create_tdesc %arg0, %arg1
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
 %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
- -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
 return
 }
 // CHECK-LABEL: func @test_create_tdesc_vc_3({{.*}}) {
 func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) {
 // CHECK: xegpu.create_tdesc %arg0, %arg1
- // CHECK-SAME: {chunk_size = 8 : i64}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 8}
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
+ -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
 return
 }
 // CHECK-LABEL: func @test_create_tdesc_vc_4({{.*}}) {
 func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) {
- // CHECK: xegpu.create_tdesc %arg0, %arg1
- // CHECK-SAME: {chunk_size = 2 : i64}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
+ // CHECK: xegpu.create_tdesc %arg0, %arg1 : ui64, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
+ -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
 return
 }
@@ -46,9 +44,9 @@ func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) {
 // CHECK-LABEL: func @test_create_tdesc_vc_5({{.*}}) {
 func.func @test_create_tdesc_vc_5(%src: memref, %offsets : vector<16 x index>) {
 // CHECK: xegpu.create_tdesc
- // CHECK-SAME: {chunk_size = 2 : i64}
- // CHECK-SAME: memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 2}
- : memref, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : memref, vector<16 x index>
+ -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/invalid_vc.mlir b/test/Dialect/XeGPU/IR/invalid_vc.mlir
index db7526320..90b8887d2 100644
--- a/test/Dialect/XeGPU/IR/invalid_vc.mlir
+++ b/test/Dialect/XeGPU/IR/invalid_vc.mlir
@@ -50,21 +50,20 @@ func.func @test_create_nd_tdesc_vc_5(%input: memref<24x32x64xf32>) {
 func.func @test_create_tdesc(%src: ui64, %offsets : vector<16x8xindex>) {
 // expected-error@+1 {{operand #1 must be vector of index values of ranks 1}}
 %1 = xegpu.create_tdesc %src, %offsets
- : ui64, vector<16x8xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
+ : ui64, vector<16x8xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<>>
 return
 }
 // -----
 func.func @test_load_gather(%src: ui64, %offsets : vector<16xindex>) {
 %0 = arith.constant dense<1>: vector<16x8xi1>
- // CHECK: xegpu.create_tdesc
- // CHECK-SAME: {chunk_size = 8}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 8}
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr>
+ // CHECK: xegpu.create_tdesc {{.*}} : ui64, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex>
+ -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr>
 // expected-error@+1 {{failed to verify that all of {value, TensorDesc} have same rank}}
 %2 = xegpu.load %1, %0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr>, vector<16x8xi1> -> vector<8x8x4xf16>
+ : !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr>, vector<16x8xi1> -> vector<8x8x4xf16>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/load_gather_vc.mlir b/test/Dialect/XeGPU/IR/load_gather_vc.mlir
index a5722ad60..68b202c38 100644
--- a/test/Dialect/XeGPU/IR/load_gather_vc.mlir
+++ b/test/Dialect/XeGPU/IR/load_gather_vc.mlir
@@ -9,13 +9,13 @@ func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) {
 %0 = arith.constant dense<1>: vector<16xi1>
 //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex>
- //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
- // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 return
 }
@@ -23,15 +23,15 @@ func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) {
 func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) {
 %0 = arith.constant dense<1>: vector<16x8xi1>
- //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} {chunk_size = 8 : i64}
- //CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 8}
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>
+ //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64, vector<16xindex>
+ //CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex>
+ -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
 //CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}>
- //CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>, vector<16x8xi1> -> vector<8x16xf32>
+ //CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16x8xi1> -> vector<8x16xf32>
 %2 = xegpu.load %1, %0 {transpose = array, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>, vector<16x8xi1> -> vector<8x16xf32>
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16x8xi1> -> vector<8x16xf32>
 return
 }
@@ -39,13 +39,13 @@ func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) {
 func.func @test_load_gather_vc_4(%src: ui64, %offsets : vector<16xindex>) {
 %0 = arith.constant dense<1>: vector<16xi1>
- //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size = 1}
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex>
+ -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 //CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
- //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/load_nd_vc.mlir b/test/Dialect/XeGPU/IR/load_nd_vc.mlir
index 202c93d60..2d4b08d1d 100644
--- a/test/Dialect/XeGPU/IR/load_nd_vc.mlir
+++ b/test/Dialect/XeGPU/IR/load_nd_vc.mlir
@@ -56,15 +56,15 @@ func.func @test_load_nd_simd_bf16(%src: ui64, %w : index, %h : index, %x : index
 // CHECK-LABEL: func @test_load_nd_block_array_simd_f16({{.*}}) {
 func.func @test_load_nd_block_array_simd_f16(%src: memref<8x32xf16>) {
 // CHECK: xegpu.create_nd_tdesc %{{.*}}[0, 0]
- // CHECK-SAME: memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ // CHECK-SAME: memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 %1 = xegpu.create_nd_tdesc %src[0, 0]
- : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr>
+ : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>
 // CHECK: xegpu.load_nd
 // CHECK-SAME: <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> :
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16xf16>
 %2 = xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16>
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16xf16>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/store_scatter.mlir b/test/Dialect/XeGPU/IR/store_scatter.mlir
index 7e454867a..e41f46773 100644
--- a/test/Dialect/XeGPU/IR/store_scatter.mlir
+++ b/test/Dialect/XeGPU/IR/store_scatter.mlir
@@ -9,25 +9,25 @@ func.func @test_store_scatter(%src: ui64, %offsets : vector<16xindex>, %dst: ui6
 %0 = arith.constant dense: vector<16xi1>
 // CHECK: xegpu.create_tdesc
 // CHECK-SAME:
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 %1 = xegpu.create_tdesc %src, %offsets
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK: xegpu.create_tdesc
 // CHECK-SAME:
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 %2 = xegpu.create_tdesc %dst, %offsets
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK: xegpu.load
 // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 %3 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32>
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 // CHECK: xegpu.store
 // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>
+ // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
 xegpu.store %3, %2, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
 return
 }
diff --git a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
index 493e54a6a..df304e739 100644
--- a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
+++ b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
@@ -8,24 +8,23 @@
 func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) {
 %0 = arith.constant dense<1>: vector<16xi1>
 // CHECK: xegpu.create_tdesc
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
- %1 = xegpu.create_tdesc %src, %offsets
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 //
CHECK: xegpu.create_tdesc - // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %2 = xegpu.create_tdesc %dst, %offsets - : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.load // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %3 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> // CHECK: xegpu.store // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> xegpu.store %3, %2, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> return } diff --git a/test/Dialect/XeGPU/IR/update_offset_vc.mlir b/test/Dialect/XeGPU/IR/update_offset_vc.mlir index 78f25e71e..164319a83 100644 --- a/test/Dialect/XeGPU/IR/update_offset_vc.mlir +++ b/test/Dialect/XeGPU/IR/update_offset_vc.mlir @@ -8,22 +8,22 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %0 = arith.constant dense<1>: vector<16xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %1 = xegpu.create_tdesc %src, %offsets - : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.load // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %3 = arith.constant dense<16>: vector<16 x index> // CHECK: xegpu.update_offset - // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %5 = xegpu.update_offset %1, %3 - : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> return } diff --git 
a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir index ae7954564..6d1c6f55e 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16.mlir @@ -150,12 +150,12 @@ module @gemm attributes {gpu.container_module} { // two 32x16 A tiles from 256x32 WG slice - %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %A_sg_init_tile_1 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c16] : memref<4096x4096xf16> //create B tiles - %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr< array_length = 2>> + %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %B_sg_init_tile_2 = xegpu.update_nd_offset %B_sg_init_tile_1, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> // %B_sg_init_tile_3 = xegpu.update_nd_offset %B_sg_init_tile_2, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> @@ -213,9 +213,9 @@ module @gemm attributes {gpu.container_module} { %A_prefetch_tile = %A_sg_prefetch_tile_iter3, %B_prefetch_tile = %B_sg_prefetch_tile_iter3 ) -> - (!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> ) @@ -228,13 +228,13 @@ module @gemm attributes {gpu.container_module} { xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier } // load A tiles - %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %a_val_0 = vector.extract %a_val [0] : vector<32x16xf16> from vector<2x32x16xf16> %a_val_1 = vector.extract %a_val [1] : vector<32x16xf16> from vector<2x32x16xf16> // load B tiles - %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> - %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : 
!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> %b_val_0 = vector.extract %b_val_arr_0 [0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> %b_val_1 = vector.extract %b_val_arr_0 [1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> @@ -254,11 +254,11 @@ module @gemm attributes {gpu.container_module} { %next_A_prefetch_tile = xegpu.update_nd_offset %A_prefetch_tile, [%c0, %c32] : !xegpu.tensor_desc<8x32xf16> %next_B_prefetch_tile = xegpu.update_nd_offset %B_prefetch_tile, [%c32, %c0] : !xegpu.tensor_desc<8x32xf16> // advance A and B tiles - %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_A_tile_1 = xegpu.update_nd_offset %A_tile_1, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16> - %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_B_tile_2 = xegpu.update_nd_offset %B_tile_2, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> // %next_B_tile_3 = xegpu.update_nd_offset %B_tile_3, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> @@ -368,9 +368,9 @@ module @gemm attributes {gpu.container_module} { scf.yield %next_A_tile_0, %next_B_tile_0, %next_B_tile_1, %new_c_val_0_0, %new_c_val_0_1, %new_c_val_0_2, %new_c_val_0_3, %new_c_val_1_0, %new_c_val_1_1, %new_c_val_1_2, %new_c_val_1_3, %new_c_val_2_0, %new_c_val_2_1, %new_c_val_2_2, %new_c_val_2_3, %new_c_val_3_0, %new_c_val_3_1, %new_c_val_3_2, %new_c_val_3_3, %next_A_prefetch_tile, %next_B_prefetch_tile - : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> } diff --git a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir index a65503775..0850e3f83 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_8x32xf16_stores.mlir @@ -150,12 +150,12 @@ module @gemm attributes 
{gpu.container_module} { // two 32x16 A tiles from 256x32 WG slice - %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %A_sg_init_tile_1 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c16] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16> //create B tiles - %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr< array_length = 2>> + %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %B_sg_init_tile_2 = xegpu.update_nd_offset %B_sg_init_tile_1, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> // %B_sg_init_tile_3 = xegpu.update_nd_offset %B_sg_init_tile_2, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> @@ -213,9 +213,9 @@ module @gemm attributes {gpu.container_module} { %A_prefetch_tile = %A_sg_prefetch_tile_iter3, %B_prefetch_tile = %B_sg_prefetch_tile_iter3 ) -> - (!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> ) @@ -228,13 +228,13 @@ module @gemm attributes {gpu.container_module} { xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier } // load A tiles - %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %a_val_0 = vector.extract %a_val [0] : vector<32x16xf16> from vector<2x32x16xf16> %a_val_1 = vector.extract %a_val [1] : vector<32x16xf16> from vector<2x32x16xf16> // load B tiles - %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> - %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_1 = xegpu.load_nd 
%B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> %b_val_0 = vector.extract %b_val_arr_0 [0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> %b_val_1 = vector.extract %b_val_arr_0 [1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> @@ -254,11 +254,11 @@ module @gemm attributes {gpu.container_module} { %next_A_prefetch_tile = xegpu.update_nd_offset %A_prefetch_tile, [%c0, %c32] : !xegpu.tensor_desc<8x32xf16> %next_B_prefetch_tile = xegpu.update_nd_offset %B_prefetch_tile, [%c32, %c0] : !xegpu.tensor_desc<8x32xf16> // advance A and B tiles - %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_A_tile_1 = xegpu.update_nd_offset %A_tile_1, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16> - %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_B_tile_2 = xegpu.update_nd_offset %B_tile_2, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> // %next_B_tile_3 = xegpu.update_nd_offset %B_tile_3, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> @@ -368,9 +368,9 @@ module @gemm attributes {gpu.container_module} { scf.yield %next_A_tile_0, %next_B_tile_0, %next_B_tile_1, %new_c_val_0_0, %new_c_val_0_1, %new_c_val_0_2, %new_c_val_0_3, %new_c_val_1_0, %new_c_val_1_1, %new_c_val_1_2, %new_c_val_1_3, %new_c_val_2_0, %new_c_val_2_1, %new_c_val_2_2, %new_c_val_2_3, %new_c_val_3_0, %new_c_val_3_1, %new_c_val_3_2, %new_c_val_3_3, %next_A_prefetch_tile, %next_B_prefetch_tile - : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> } diff --git a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir index b65ed81c3..9de6ec890 100644 --- a/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir +++ b/test/Integration/Dialect/XeGPU/gemm_4kx4kx4k_f16_f16_f16_w_simple_B_prefetch.mlir @@ -146,12 +146,12 @@ module @gemm attributes {gpu.container_module} { // two 32x16 A tiles from 256x32 WG slice - %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %A_sg_init_tile_0 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c0] : memref<4096x4096xf16> -> 
!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %A_sg_init_tile_1 = xegpu.create_nd_tdesc %A[%C_sg_tile_offset_x, %c16] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16> //create B tiles - %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr< array_length = 2>> + %B_sg_init_tile_0 = xegpu.create_nd_tdesc %B[%c0, %C_sg_tile_offset_y] : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %B_sg_init_tile_1 = xegpu.update_nd_offset %B_sg_init_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr< array_length = 2>> // %B_sg_init_tile_2 = xegpu.update_nd_offset %B_sg_init_tile_1, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> // %B_sg_init_tile_3 = xegpu.update_nd_offset %B_sg_init_tile_2, [%c0, %c16] : !xegpu.tensor_desc<32x16xf16> @@ -209,9 +209,9 @@ module @gemm attributes {gpu.container_module} { %A_prefetch_tile = %A_sg_prefetch_tile_iter3, %B_prefetch_tile = %B_sg_prefetch_tile_iter3 ) -> - (!xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> ) @@ -224,13 +224,13 @@ module @gemm attributes {gpu.container_module} { xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier } // load A tiles - %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + %a_val = xegpu.load_nd %A_tile_0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %a_val_0 = vector.extract %a_val [0] : vector<32x16xf16> from vector<2x32x16xf16> %a_val_1 = vector.extract %a_val [1] : vector<32x16xf16> from vector<2x32x16xf16> // load B tiles - %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> - %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_0 = xegpu.load_nd %B_tile_0 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> + %b_val_arr_1 = xegpu.load_nd %B_tile_1 {packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> %b_val_0 = vector.extract %b_val_arr_0 [0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> %b_val_1 = 
vector.extract %b_val_arr_0 [1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> @@ -250,11 +250,11 @@ module @gemm attributes {gpu.container_module} { %next_A_prefetch_tile = xegpu.update_nd_offset %A_prefetch_tile, [%c0, %c32] : !xegpu.tensor_desc<8x32xf16> %next_B_prefetch_tile = xegpu.update_nd_offset %B_prefetch_tile, [%c32, %c0] : !xegpu.tensor_desc<8x32xf16> // advance A and B tiles - %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_A_tile_0 = xegpu.update_nd_offset %A_tile_0, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_A_tile_1 = xegpu.update_nd_offset %A_tile_1, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16> - %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> - %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %next_B_tile_0 = xegpu.update_nd_offset %B_tile_0, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %next_B_tile_1 = xegpu.update_nd_offset %B_tile_1, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // %next_B_tile_2 = xegpu.update_nd_offset %B_tile_2, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> // %next_B_tile_3 = xegpu.update_nd_offset %B_tile_3, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16> @@ -364,9 +364,9 @@ module @gemm attributes {gpu.container_module} { scf.yield %next_A_tile_0, %next_B_tile_0, %next_B_tile_1, %new_c_val_0_0, %new_c_val_0_1, %new_c_val_0_2, %new_c_val_0_3, %new_c_val_1_0, %new_c_val_1_1, %new_c_val_1_2, %new_c_val_1_3, %new_c_val_2_0, %new_c_val_2_1, %new_c_val_2_2, %new_c_val_2_3, %new_c_val_3_0, %new_c_val_3_1, %new_c_val_3_2, %new_c_val_3_3, %next_A_prefetch_tile, %next_B_prefetch_tile - : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, - !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr>, + : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>,vector<8x16xf32>, !xegpu.tensor_desc<8x32xf16>, !xegpu.tensor_desc<8x32xf16> } diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir index ad318ecd3..832cf06a2 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_16_16_2.vc.mlir @@ -19,8 +19,8 @@ module @gemm attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<16x32xf16>, %arg1: memref<16x32xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<16x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> - %1 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<16x16xf16, #xegpu.tdesc_attr> -> vector<2x16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<16x32xf16> -> 
!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> %3 = arith.extf %1: vector<2x16x16xf16> to vector<2x16x16xf32> %4 = vector.extract %3[0]: vector<16x16xf32> from vector<2x16x16xf32> %5 = vector.extract %3[1]: vector<16x16xf32> from vector<2x16x16xf32> diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir index b672f457c..cd3f7ffc4 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_32_16_2.vc.mlir @@ -18,9 +18,9 @@ module @gemm attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x32xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> + %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x32xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<32x16xf16, #xegpu.tdesc_attr> -> vector<2x32x16xf16> + : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %3 = arith.extf %1: vector<2x32x16xf16> to vector<2x32x16xf32> %4 = vector.extract %3[0]: vector<32x16xf32> from vector<2x32x16xf32> %5 = vector.extract %3[1]: vector<32x16xf32> from vector<2x32x16xf32> diff --git a/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir b/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir index 72a06b2e6..1c3e463c8 100644 --- a/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/load_with_block_array_8_16_2.vc.mlir @@ -19,10 +19,10 @@ module @gemm attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<8x32xf16>, %arg1: memref<8x32xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> + %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %arg1[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32> %2 = xegpu.create_nd_tdesc %arg1[0, 16] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32> - %3 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16> + %3 = xegpu.load_nd %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16xf16> %4 = vector.extract %3[0]: vector<8x16xf16> from vector<2x8x16xf16> %5 = vector.extract %3[1]: vector<8x16xf16> from vector<2x8x16xf16> %8 = arith.extf %4: vector<8x16xf16> to vector<8x16xf32> diff --git a/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir 
index 084a58c3f..1fd95155f 100644 --- a/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir @@ -44,21 +44,21 @@ module @gemm attributes {gpu.container_module} { // Spirv has no lowering for memref.reinterpret_cast with different sizes (doesn't work: memref<3x16xf32> to memref<16xf32>) // Each row has a tdesc with offsets that determine linearized memref's values to be loaded %offsets_row1 = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> - %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %row_1_store = arith.select %row_mask, %row_1_loaded, %user_val : vector<16xi1>, vector<16xf32> - xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> - %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %row_2_store = arith.select %row_mask, %row_2_loaded, %user_val : vector<16xi1>, vector<16xf32> - xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> // The entire row is out of bounds - %row_3_out_td = xegpu.update_offset %row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %row_3_out_td = xegpu.update_offset 
%row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather2d_masked_slm_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather2d_masked_slm_f32.mlir index 2f4f6c46f..4713f5cae 100644 --- a/test/Integration/Dialect/XeGPU/loadgather2d_masked_slm_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather2d_masked_slm_f32.mlir @@ -44,21 +44,21 @@ module @gemm attributes {gpu.container_module} { // Spirv has no lowering for memref.reinterpret_cast with different sizes (doesn't work: memref<3x16xf32> to memref<16xf32>) // Each row has a tdesc with offsets that determine linearized memref's values to be loaded %offsets_row1 = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> - %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32> %row_1_store = arith.select %row_mask, %row_1_loaded, %user_val : vector<16xi1>, vector<16xf32> - xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> - %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> + %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32> %row_2_store = arith.select %row_mask, %row_2_loaded, %user_val : vector<16xi1>, vector<16xf32> - xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, 
!xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> // The entire row is out of bounds - %row_3_out_td = xegpu.update_offset %row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %row_3_out_td = xegpu.update_offset %row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir index 575fc4c36..21c54e674 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir @@ -34,10 +34,10 @@ module @gemm attributes {gpu.container_module} { // Valid offsets (%offsets for which %mask is 1) should not exceed 16*2=32. %offsets = arith.constant dense<[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]> : vector<16xi1> - %tdesc_in = xegpu.create_tdesc %in, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> - %tdesc_out = xegpu.create_tdesc %out, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16x2xf32> - xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %tdesc_out = xegpu.create_tdesc %out, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xf32> + xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir index 2175f0e4b..c99ec4224 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir @@ -34,10 +34,10 @@ module @gemm attributes {gpu.container_module} { // Valid offsets (%offsets for which %mask is 1) should not exceed 16*2=32. 
%offsets = arith.constant dense<[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]> : vector<16xi1> - %tdesc_in = xegpu.create_tdesc %in, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr> - %tdesc_out = xegpu.create_tdesc %out, %offsets {chunk_size = 2} : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16x2xi32> - xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xi32>, !xegpu.tensor_desc<16x2xi32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> + %tdesc_out = xegpu.create_tdesc %out, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> + %loaded = xegpu.load %tdesc_in, %mask : !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xi32> + xegpu.store %loaded, %tdesc_out, %mask : vector<16x2xi32>, !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_f32.mlir index 452d7d7e7..88e2cbf8c 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_f32.mlir @@ -28,10 +28,10 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> - xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> + xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir index e7dc1dd74..349576169 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir @@ -28,10 +28,10 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<[1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], 
sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> - xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> + xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_masked_slm_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_masked_slm_f32.mlir index bf28c6afc..dc0686165 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_masked_slm_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_masked_slm_f32.mlir @@ -28,10 +28,10 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<[1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> - xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32> + xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/loadgather_slm_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_slm_f32.mlir index 4d43d67dc..6fdcdf149 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_slm_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_slm_f32.mlir @@ -29,10 +29,10 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, 
vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -> vector<16xf32> - xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32> + xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir index 5803b748c..60e7b5708 100644 --- a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir +++ b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_2.vc.mlir @@ -22,8 +22,8 @@ module @gemm attributes {gpu.container_module} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load tile - %tile = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<32x16xf32> -> !xegpu.tensor_desc<32x8xf32, #xegpu.tdesc_attr> - %value = xegpu.load_nd %tile : !xegpu.tensor_desc<32x8xf32, #xegpu.tdesc_attr> -> vector<2x32x8xf32> + %tile = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<32x16xf32> -> !xegpu.tensor_desc<32x8xf32, #xegpu.block_tdesc_attr> + %value = xegpu.load_nd %tile : !xegpu.tensor_desc<32x8xf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xf32> // extract the bottom 8x8 part of first 32x8 block %sub_tile0 = vector.extract_strided_slice %value { offsets = [0, 24], strides = [1, 1], sizes = [1, 8] } : vector<2x32x8xf32> to vector<1x8x8xf32> // extract the bottom 8x8 part of second 32x8 block diff --git a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir index 9d969b3d2..7758ce094 100644 --- a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir +++ b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir @@ -53,11 +53,11 @@ module { %26 = arith.muli %25, %c256 : index %27 = arith.divsi %15, %c32 : index %28 = arith.muli %27, %c32 : index - %29 = xegpu.create_nd_tdesc %arg0[%19, %15] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + %29 = xegpu.create_nd_tdesc %arg0[%19, %15] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %30 = arith.divsi %23, %c32 : index %31 = arith.muli %30, %c32 : index %32 = arith.addi %26, %2 : index - %33 = xegpu.create_nd_tdesc %arg0[%32, %28] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> + %33 = xegpu.create_nd_tdesc %arg0[%32, %28] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %34 = arith.remsi %11, %c4 : index %35 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) { %39 = vector.shape_cast %arg4 : vector<8x1xf32> to vector<8xf32> @@ -75,29 +75,29 @@ module { %50 = arith.addi %49, %24 : index %51 = arith.divsi %50, %c128 : index %52 = arith.muli %51, %c128 : index - %53 = 
xegpu.create_nd_tdesc %arg1[%50, %23] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %33 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %54 = xegpu.update_nd_offset %33, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %54 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %55 = xegpu.update_nd_offset %54, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %55 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %56 = xegpu.update_nd_offset %55, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> + %53 = xegpu.create_nd_tdesc %arg1[%50, %23] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %33 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %54 = xegpu.update_nd_offset %33, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %54 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %55 = xegpu.update_nd_offset %54, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %55 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %56 = xegpu.update_nd_offset %55, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %57 = arith.addi %52, %3 : index - %58 = xegpu.create_nd_tdesc %arg1[%57, %31] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %58 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - %59 = xegpu.update_nd_offset %58, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %59 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - %60 = xegpu.update_nd_offset %59, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %60 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - %61 = xegpu.update_nd_offset %60, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> - %62:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %29, %arg7 = %53, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %56, %arg17 = %61, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>, index) { + %58 = xegpu.create_nd_tdesc %arg1[%57, %31] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, 
#xegpu.block_tdesc_attr> + xegpu.prefetch_nd %58 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %59 = xegpu.update_nd_offset %58, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %59 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %60 = xegpu.update_nd_offset %59, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %60 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %61 = xegpu.update_nd_offset %60, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %62:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %29, %arg7 = %53, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %56, %arg17 = %61, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { %391 = arith.cmpi eq, %arg18, %c21 : index %392 = arith.select %391, %c0, %arg18 : index scf.if %391 { gpu.barrier } %393 = arith.addi %392, %c1 : index - %394 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x32x16xbf16> + %394 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> %395 = vector.shape_cast %394 : vector<2x32x16xbf16> to vector<1024xbf16> %396 = vector.shuffle %395, %395 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 
301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> %397 = vector.shuffle %395, %395 [512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 
997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023] : vector<1024xbf16>, vector<1024xbf16> @@ -117,7 +117,7 @@ module { %411 = vector.shape_cast %410 : vector<128xbf16> to vector<8x16xbf16> %412 = vector.shuffle %397, %397 [384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<512xbf16>, vector<512xbf16> %413 = vector.shape_cast %412 : vector<128xbf16> to vector<8x16xbf16> - %414 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xbf16> + %414 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> %415 = vector.shape_cast %414 : vector<2x16x16x2xbf16> to vector<1024xbf16> %416 = vector.shuffle %415, %415 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 
412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> %417 = vector.shuffle %415, %415 [512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023] : vector<1024xbf16>, vector<1024xbf16> @@ -130,15 +130,15 @@ module { %424 = vector.shuffle %417, %417 [256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 
314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<512xbf16>, vector<512xbf16> %425 = vector.shape_cast %424 : vector<256xbf16> to vector<8x16x2xbf16> xegpu.compile_hint - xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> + xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint - %426 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %427 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> + %426 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %427 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint xegpu.compile_hint - %428 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> - %429 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + %428 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %429 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint %430 = xegpu.dpas %399, %419, %arg8 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %431 = xegpu.dpas %407, %421, %430 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> @@ -157,7 +157,7 @@ module { %444 = xegpu.dpas %405, %423, %arg15 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %445 = xegpu.dpas %413, %425, %444 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> xegpu.compile_hint - scf.yield %428, %429, %431, %433, %435, %437, %439, %441, %443, %445, %426, %427, %393 : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, 
vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>, index
+        scf.yield %428, %429, %431, %433, %435, %437, %439, %441, %443, %445, %426, %427, %393 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index
      }
      %63 = vector.shape_cast %62#2 : vector<8x16xf32> to vector<128xf32>
      %64 = vector.shape_cast %62#3 : vector<8x16xf32> to vector<128xf32>
@@ -419,13 +419,13 @@ module {
      %320 = arith.addf %318, %319 : vector<16xf32>
      %321 = vector.shuffle %317, %320 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32>
      %alloc = memref.alloc() : memref<256x4xf32, #spirv.storage_class>
-     %322 = xegpu.create_nd_tdesc %alloc[%13, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     %322 = xegpu.create_nd_tdesc %alloc[%13, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      %323 = arith.addi %13, %c8 : index
-     %324 = xegpu.create_nd_tdesc %alloc[%323, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     %324 = xegpu.create_nd_tdesc %alloc[%323, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      %325 = arith.addi %13, %c16 : index
-     %326 = xegpu.create_nd_tdesc %alloc[%325, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     %326 = xegpu.create_nd_tdesc %alloc[%325, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      %327 = arith.addi %13, %c24 : index
-     %328 = xegpu.create_nd_tdesc %alloc[%327, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     %328 = xegpu.create_nd_tdesc %alloc[%327, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      %329 = vector.shuffle %321, %321 [0, 1, 2, 3, 4, 5, 6, 7] : vector<32xf32>, vector<32xf32>
      %330 = vector.shape_cast %329 : vector<8xf32> to vector<8x1xf32>
      %331 = vector.shuffle %321, %321 [8, 9, 10, 11, 12, 13, 14, 15] : vector<32xf32>, vector<32xf32>
@@ -434,13 +434,13 @@ module {
      %334 = vector.shape_cast %333 : vector<8xf32> to vector<8x1xf32>
      %335 = vector.shuffle %321, %321 [24, 25, 26, 27, 28, 29, 30, 31] : vector<32xf32>, vector<32xf32>
      %336 = vector.shape_cast %335 : vector<8xf32> to vector<8x1xf32>
-     xegpu.store_nd %330, %322 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-     xegpu.store_nd %332, %324 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-     xegpu.store_nd %334, %326 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-     xegpu.store_nd %336, %328 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+     xegpu.store_nd %330, %322 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+     xegpu.store_nd %332, %324 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+     xegpu.store_nd %334, %326 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+     xegpu.store_nd %336, %328 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
      gpu.barrier
-     %337 = xegpu.create_nd_tdesc %alloc[%9, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr>
-     %338 = xegpu.load_nd %337 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr> -> vector<8x4xf32>
+     %337 = xegpu.create_nd_tdesc %alloc[%9, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr>
+     %338 = xegpu.load_nd %337 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32>
      %339 = vector.shape_cast %338 : vector<8x4xf32> to vector<32xf32>
      %340 = vector.shuffle %339, %339 [0, 1, 2, 3] : vector<32xf32>, vector<32xf32>
      %341 = vector.shuffle %339, %339 [4, 5, 6, 7] : vector<32xf32>, vector<32xf32>
@@ -499,8 +499,8 @@ module {
     }
     %36 = arith.addi %16, %18 : index
     %37 = arith.addi %36, %9 : index
-    %38 = xegpu.create_nd_tdesc %arg2[%37, %7] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-    xegpu.store_nd %35, %38 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %38 = xegpu.create_nd_tdesc %arg2[%37, %7] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+    xegpu.store_nd %35, %38 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     gpu.return
   }
 }
diff --git a/test/Transforms/VectorLinearize/postop_reduce_n.mlir b/test/Transforms/VectorLinearize/postop_reduce_n.mlir
index 4074eff3d..09f28d414 100644
--- a/test/Transforms/VectorLinearize/postop_reduce_n.mlir
+++ b/test/Transforms/VectorLinearize/postop_reduce_n.mlir
@@ -56,13 +56,13 @@ module {
     %28 = arith.muli %27, %c32 : index
     %29 = arith.addi %19, %c0 : index
     %30 = arith.addi %15, %c0 : index
-    %31 = xegpu.create_nd_tdesc %arg0[%29, %30] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>
+    %31 = xegpu.create_nd_tdesc %arg0[%29, %30] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>
     %32 = arith.divsi %23, %c32 : index
     %33 = arith.muli %32, %c32 : index
     %34 = arith.addi %26, %2 : index
     %35 = arith.addi %34, %c0 : index
     %36 = arith.addi %28, %c0 : index
-    %37 = xegpu.create_nd_tdesc %arg0[%35, %36] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
+    %37 = xegpu.create_nd_tdesc %arg0[%35, %36] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
     %38 = arith.remsi %11, %c4 : index
     %39 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) {
@@ -89,31 +89,31 @@ module {
      %57 = arith.muli %56, %c128 : index
      %58 = arith.addi %55, %c0 : index
      %59 = arith.addi %23, %c0 : index
-     %60 = xegpu.create_nd_tdesc %arg1[%58, %59] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %37 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     %61 = xegpu.update_nd_offset %37, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %61 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     %62 = xegpu.update_nd_offset %61, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %62 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
-     %63 = xegpu.update_nd_offset %62, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>
+     %60 = xegpu.create_nd_tdesc %arg1[%58, %59] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>
+     xegpu.prefetch_nd %37 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     %61 = xegpu.update_nd_offset %37, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     xegpu.prefetch_nd %61 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     %62 = xegpu.update_nd_offset %61, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     xegpu.prefetch_nd %62 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
+     %63 = xegpu.update_nd_offset %62, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>
      %64 = arith.addi %57, %3 : index
      %65 = arith.addi %64, %c0 : index
      %66 = arith.addi %33, %c0 : index
-     %67 = xegpu.create_nd_tdesc %arg1[%65, %66] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %67 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     %68 = xegpu.update_nd_offset %67, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %68 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     %69 = xegpu.update_nd_offset %68, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     xegpu.prefetch_nd %69 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     %70 = xegpu.update_nd_offset %69, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>
-     %71:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %31, %arg7 = %60, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %63, %arg17 = %70, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>,
vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>, index) { + %67 = xegpu.create_nd_tdesc %arg1[%65, %66] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %67 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %68 = xegpu.update_nd_offset %67, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %68 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %69 = xegpu.update_nd_offset %68, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %69 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %70 = xegpu.update_nd_offset %69, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %71:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %31, %arg7 = %60, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %63, %arg17 = %70, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { %437 = arith.cmpi eq, %arg18, %c21 : index %438 = arith.select %437, %c0, %arg18 : index scf.if %437 { gpu.barrier } %439 = arith.addi %438, %c1 : index - %440 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x32x16xbf16> + %440 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK: vector.shape_cast %{{.*}} : vector<2x32x16xbf16> to vector<1024xbf16> //CHECK: vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 
245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> @@ -144,7 +144,7 @@ module { %448 = vector.extract_strided_slice %442 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %449 = vector.extract_strided_slice %442 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %450 = vector.extract_strided_slice %442 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> - %451 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> -> vector<2x16x16x2xbf16> + %451 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> //CHECK: vector.shape_cast %{{.*}} : vector<2x16x16x2xbf16> to vector<1024xbf16> //CHECK: vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 
262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> @@ -164,15 +164,15 @@ module { %456 = vector.extract_strided_slice %453 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> %457 = vector.extract_strided_slice %453 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> xegpu.compile_hint - xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> + xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint - %458 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr> - %459 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr> + %458 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %459 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint xegpu.compile_hint - %460 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> - %461 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr> + %460 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %461 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint %462 = xegpu.dpas %443, %454, %arg8 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %463 = xegpu.dpas %447, %455, %462 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> @@ -191,7 +191,7 @@ module { %476 = xegpu.dpas %446, %456, %arg15 : vector<8x16xbf16>, 
vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32>
      %477 = xegpu.dpas %450, %457, %476 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32>
      xegpu.compile_hint
-     scf.yield %460, %461, %463, %465, %467, %469, %471, %473, %475, %477, %458, %459, %439 : !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.tdesc_attr>, index
+     scf.yield %460, %461, %463, %465, %467, %469, %471, %473, %475, %477, %458, %459, %439 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index
     }
 //CHECK-COUNT-8: vector.shape_cast %{{.*}} : vector<8x16xf32> to vector<128xf32>
@@ -489,27 +489,27 @@ module {
     %alloc = memref.alloc() : memref<256x4xf32, #spirv.storage_class>
     %359 = arith.addi %13, %c0 : index
     %360 = arith.addi %38, %c0 : index
-    %361 = xegpu.create_nd_tdesc %alloc[%359, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %361 = xegpu.create_nd_tdesc %alloc[%359, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     %362 = arith.addi %13, %c8 : index
-    %363 = xegpu.create_nd_tdesc %alloc[%362, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %363 = xegpu.create_nd_tdesc %alloc[%362, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     %c16 = arith.constant 16 : index
     %364 = arith.addi %13, %c16 : index
-    %365 = xegpu.create_nd_tdesc %alloc[%364, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %365 = xegpu.create_nd_tdesc %alloc[%364, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     %c24 = arith.constant 24 : index
     %366 = arith.addi %13, %c24 : index
-    %367 = xegpu.create_nd_tdesc %alloc[%366, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    %367 = xegpu.create_nd_tdesc %alloc[%366, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     %368 = vector.extract_strided_slice %358 {offsets = [0, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32>
     %369 = vector.extract_strided_slice %358 {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32>
     %370 = vector.extract_strided_slice %358 {offsets = [16, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32>
     %371 = vector.extract_strided_slice %358 {offsets = [24, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32>
-    xegpu.store_nd %368, %361 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-    xegpu.store_nd %369, %363 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-    xegpu.store_nd %370, %365 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-    xegpu.store_nd %371, %367 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+    xegpu.store_nd %368, %361 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+    xegpu.store_nd %369, %363 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+    xegpu.store_nd %370, %365 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+    xegpu.store_nd %371, %367 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
     gpu.barrier
     %372 = arith.addi %9, %c0 : index
-    %373 = xegpu.create_nd_tdesc %alloc[%372, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr>
-    %374 = xegpu.load_nd %373 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr> -> vector<8x4xf32>
+    %373 = xegpu.create_nd_tdesc %alloc[%372, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr>
+    %374 = xegpu.load_nd %373 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32>
     %375 = vector.extract_strided_slice %374 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32>
     %376 = vector.extract_strided_slice %374 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32>
     %377 = vector.extract_strided_slice %374 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32>
@@ -591,8 +591,8 @@ module {
    %41 = arith.addi %40, %9 : index
    %42 = arith.addi %41, %c0 : index
    %43 = arith.addi %7, %c0 : index
-   %44 = xegpu.create_nd_tdesc %arg2[%42, %43] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
-   xegpu.store_nd %39, %44 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.tdesc_attr>
+   %44 = xegpu.create_nd_tdesc %arg2[%42, %43] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
+   xegpu.store_nd %39, %44 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr>
    gpu.return
  }
}
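
Note: the hunks above are a mechanical rename, so here is a minimal
illustrative sketch of the before/after surface syntax. It is not part of
the patch itself: the value names and the chunk_size/array_length values are
invented, the pre-patch spelling of chunk_size on create_tdesc is an
assumption based on the patch subject, and the `<...>` parameter payloads
are reconstructed from the attribute definitions earlier in this patch (the
hunks above lost the text inside some angle brackets, e.g. after
#xegpu.cache_hint and #spirv.storage_class).

  // Before: one generic #xegpu.tdesc_attr served both descriptor kinds,
  // with `scattered` telling them apart and chunk_size carried by the op.
  %t0 = xegpu.create_tdesc %src, %offsets {chunk_size = 2 : i64}
        : memref<1024xf32>, vector<16xindex>
        -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr<scattered = true>>

  // After: the attribute itself encodes the descriptor kind, and chunk_size
  // (default 1) is a parameter of #xegpu.scatter_tdesc_attr.
  %t1 = xegpu.create_tdesc %src, %offsets
        : memref<1024xf32>, vector<16xindex>
        -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>

  // Block descriptors keep memory_scope/array_length/boundary_check under
  // #xegpu.block_tdesc_attr; with every parameter at its default the
  // attribute prints bare, as it does throughout the tests above. A
  // non-default parameter would print explicitly, for example:
  %t2 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<32x16xf32>
        -> !xegpu.tensor_desc<32x8xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>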