From 0d068255e3a96474850119ef729b5c68110da9e4 Mon Sep 17 00:00:00 2001
From: Amy Zhuang
Date: Thu, 21 Nov 2024 03:29:25 +0200
Subject: [PATCH 1/3] Support index element type in VectorLinearize pass

Clone the upstream patterns for the constant op and vectorizable ops,
and remove the isLessThanTargetBitWidth check so that vectors with
index element type are linearized as well.

Remove the IndexCastOp pattern from the XeGPUToVC pass.
---
 lib/Conversion/XeGPUToVC/XeGPUToVC.cpp |  27 ------
 lib/Transforms/VectorLinearize.cpp     | 123 ++++++++++++++++---------
 test/Transforms/vector-linearize.mlir  |  92 ++++++++++++++++++
 3 files changed, 174 insertions(+), 68 deletions(-)

diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
index 9e080849d..8fa5b779e 100644
--- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
+++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -661,22 +661,6 @@ class VectorShapeCastPattern : public OpConversionPattern<vector::ShapeCastOp> {
   }
 };
 
-template <typename OpTy>
-class IndexCastPattern : public OpConversionPattern<OpTy> {
-public:
-  using OpConversionPattern<OpTy>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(OpTy indexCastOp, typename OpTy::Adaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    auto *converter = OpConversionPattern<OpTy>::getTypeConverter();
-    Type dstType = converter->convertType(indexCastOp.getType());
-    if (!dstType)
-      return failure();
-    rewriter.replaceOpWithNewOp<OpTy>(indexCastOp, dstType, adaptor.getIn());
-    return success();
-  }
-};
-
 class SCFForPattern : public OpConversionPattern<scf::ForOp> {
 public:
   using OpConversionPattern::OpConversionPattern;
@@ -823,14 +807,6 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
     target.addDynamicallyLegalDialect<scf::SCFDialect>(
         [&](Operation *op) { return isLegalXeGPUSCFOp(op, typeConverter); });
 
-    target.addDynamicallyLegalOp<arith::IndexCastOp, arith::IndexCastUIOp>(
-        [&](Operation *op) {
-          if (auto vecTy = dyn_cast<VectorType>(op->getResult(0).getType())) {
-            return typeConverter.isLegal(vecTy);
-          }
-          return true;
-        });
-
     target.addIllegalOp();    // TODO: can we change it to addDynamicLegalOp?
@@ -883,9 +859,6 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
 
     patterns.add(typeConverter, patterns.getContext());
 
-    patterns.add<IndexCastPattern<arith::IndexCastOp>,
-                 IndexCastPattern<arith::IndexCastUIOp>>(typeConverter,
-                                                         patterns.getContext());
     // Ops to llvm.genx only Patterns
     patterns.add(patterns.getContext());
diff --git a/lib/Transforms/VectorLinearize.cpp b/lib/Transforms/VectorLinearize.cpp
index bf357b92c..1040db35e 100644
--- a/lib/Transforms/VectorLinearize.cpp
+++ b/lib/Transforms/VectorLinearize.cpp
@@ -34,33 +34,93 @@ namespace imex {
 
 namespace {
 
-// rewrite arith.constant op in form of vector<1xmxindex> into 1D form
-// (vector<mxindex>)
-struct ArithConstantOpConversion final
+// Cloned from upstream with isLessThanTargetBitWidth check removed.
+struct ConstantOpConversion final
     : public mlir::OpConversionPattern<mlir::arith::ConstantOp> {
   using OpConversionPattern::OpConversionPattern;
   mlir::LogicalResult
   matchAndRewrite(mlir::arith::ConstantOp constOp, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
-    auto value = llvm::dyn_cast<mlir::DenseElementsAttr>(constOp.getValue());
-    if (!value || value.getType().getRank() != 2)
-      return mlir::failure();
-    auto type = value.getType();
-    auto shape = type.getShape();
-    auto elemTy = type.getElementType();
-    if (shape[0] != 1 || !elemTy.isIndex())
+    auto resType =
+        getTypeConverter()->convertType<mlir::VectorType>(constOp.getType());
+
+    if (resType.isScalable() &&
+        !mlir::isa<mlir::SplatElementsAttr>(constOp.getValue()))
+      return rewriter.notifyMatchFailure(
+          constOp,
+          "Cannot linearize a constant scalable vector that's not a splat");
+
+    if (!resType)
+      return rewriter.notifyMatchFailure(constOp, "can't convert return type");
+    auto dstElementsAttr =
+        mlir::dyn_cast<mlir::DenseElementsAttr>(constOp.getValue());
+    if (!dstElementsAttr)
+      return rewriter.notifyMatchFailure(constOp, "unsupported attr type");
+
+    dstElementsAttr = dstElementsAttr.reshape(resType);
+    rewriter.replaceOpWithNewOp<mlir::arith::ConstantOp>(constOp, resType,
+                                                         dstElementsAttr);
+    return mlir::success();
+  }
+};
+
+// Cloned from upstream with isLessThanTargetBitWidth check removed.
+struct VectorizableOpConversion final
+    : public mlir::OpTraitConversionPattern<mlir::OpTrait::Vectorizable> {
+  using OpTraitConversionPattern::OpTraitConversionPattern;
+  mlir::LogicalResult
+  matchAndRewrite(mlir::Operation *op, llvm::ArrayRef<mlir::Value> operands,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    mlir::FailureOr<mlir::Operation *> newOp =
+        convertOpResultTypes(op, operands, *getTypeConverter(), rewriter);
+    if (failed(newOp))
       return mlir::failure();
-    auto newTy = mlir::VectorType::get({shape[1]}, elemTy);
-    value = value.reshape(newTy);
-    auto newOp =
-        rewriter.create<mlir::arith::ConstantOp>(constOp.getLoc(), value);
-    auto castOp = rewriter.create<mlir::vector::ShapeCastOp>(constOp.getLoc(),
-                                                             type, newOp);
-    rewriter.replaceOp(constOp, castOp);
+
+    rewriter.replaceOp(op, (*newOp)->getResults());
     return mlir::success();
   }
 };
 
+// Cloned from upstream with isLessThanTargetBitWidth check removed.
+static void populateVectorLinearizeTypeConversionsAndLegality(
+    mlir::TypeConverter &typeConverter, mlir::RewritePatternSet &patterns,
+    mlir::ConversionTarget &target) {
+
+  typeConverter.addConversion(
+      [](mlir::VectorType type) -> std::optional<mlir::Type> {
+        if (!mlir::vector::isLinearizableVector(type))
+          return type;
+
+        return mlir::VectorType::get(type.getNumElements(),
+                                     type.getElementType(), type.isScalable());
+      });
+
+  auto materializeCast = [](mlir::OpBuilder &builder, mlir::Type type,
+                            mlir::ValueRange inputs,
+                            mlir::Location loc) -> mlir::Value {
+    if (inputs.size() != 1 ||
+        !mlir::isa<mlir::VectorType>(inputs.front().getType()) ||
+        !mlir::isa<mlir::VectorType>(type))
+      return nullptr;
+
+    return builder.create<mlir::vector::ShapeCastOp>(loc, type, inputs.front());
+  };
+  typeConverter.addArgumentMaterialization(materializeCast);
+  typeConverter.addSourceMaterialization(materializeCast);
+  typeConverter.addTargetMaterialization(materializeCast);
+  target.markUnknownOpDynamicallyLegal(
+      [=](mlir::Operation *op) -> std::optional<bool> {
+        if ((mlir::isa<mlir::arith::ConstantOp>(op) ||
+             op->hasTrait<mlir::OpTrait::Vectorizable>())) {
+          return typeConverter.isLegal(op);
+        }
+        return std::nullopt;
+      });
+
+  patterns.add<ConstantOpConversion, VectorizableOpConversion>(
+      typeConverter, patterns.getContext());
+}
+
 struct VectorLoadOpConversion final
     : public mlir::OpConversionPattern<mlir::vector::LoadOp> {
   using mlir::OpConversionPattern<mlir::vector::LoadOp>::OpConversionPattern;
@@ -513,38 +573,19 @@ struct VectorLinearizePass final
           return (op && op.getAggregate().getType().getRank() == 1);
         });
 
-    // borrowed from upstream with hacking for index type. Currently
-    // we only target vector<1xmxindex> to vector<mxindex> conversion. It is
-    // unclear whether others are valid or not; thus they are left untouched.
-    target.addDynamicallyLegalOp<mlir::arith::ConstantOp>(
-        [&](mlir::arith::ConstantOp op) -> bool {
-          auto vecTy = mlir::dyn_cast<mlir::VectorType>(op.getType());
-          if (!vecTy || vecTy.getRank() == 0)
-            return true;
-
-          auto elemTy = vecTy.getElementType();
-          if (elemTy.isIndex()) {
-            if (vecTy.getRank() == 2 && vecTy.getShape()[0] == 1)
-              return false;
-            return true;
-          }
-          return !mlir::vector::isLinearizableVector(vecTy);
-        });
-
-    patterns.add<ArithConstantOpConversion, VectorLoadOpConversion,
-                 VectorStoreOpConversion, VectorCreateMaskOpConversion>(
-        typeConverter, context);
+    patterns.add<VectorLoadOpConversion,
+                 VectorStoreOpConversion, VectorCreateMaskOpConversion>(
+        typeConverter, context);
     // Shuffle16x16 will fallback to Shuffle1D for non 16x16 sizes.
     mlir::vector::populateVectorTransposeLoweringPatterns(
         patterns,
         mlir::vector::VectorTransformsOptions().setVectorTransposeLowering(
             mlir::vector::VectorTransposeLowering::Shuffle16x16));
 
-    unsigned targetVectBitWidth = std::numeric_limits<unsigned>::max();
-    mlir::vector::populateVectorLinearizeTypeConversionsAndLegality(
-        typeConverter, patterns, target, targetVectBitWidth);
+    populateVectorLinearizeTypeConversionsAndLegality(typeConverter, patterns,
+                                                      target);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
                                                   std::move(patterns))))
       return signalPassFailure();
diff --git a/test/Transforms/vector-linearize.mlir b/test/Transforms/vector-linearize.mlir
index 17dfe5102..dcfa5217c 100644
--- a/test/Transforms/vector-linearize.mlir
+++ b/test/Transforms/vector-linearize.mlir
@@ -284,3 +284,95 @@ func.func @test_vector_store_load_4x4(%buffer: memref<4x4xf32>) {
   vector.store %0, %buffer[%c0, %c0] : memref<4x4xf32>, vector<4x4xf32>
   return
 }
+
+// -----
+// CHECK-LABEL: @test_linearize_index
+// CHECK-SAME: (%[[ARG0:.*]]: vector<2x2xindex>, %[[ARG1:.*]]: vector<2x2xi32>) -> vector<2x2xindex> {
+// CHECK: %[[T0:.*]] = vector.shape_cast %[[ARG1]] : vector<2x2xi32> to vector<4xi32>
+// CHECK: %[[T1:.*]] = vector.shape_cast %[[ARG0]] : vector<2x2xindex> to vector<4xindex>
+// CHECK: %[[CST:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK: %[[T2:.*]] = arith.addi %[[T1]], %[[CST]] : vector<4xindex>
+// CHECK: %[[T3:.*]] = arith.index_cast %[[T2]] : vector<4xindex> to vector<4xi32>
+// CHECK: %[[T4:.*]] = arith.muli %[[T3]], %[[T0]] : vector<4xi32>
+// CHECK: %[[T5:.*]] = arith.index_cast %[[T4]] : vector<4xi32> to vector<4xindex>
+// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<4xindex> to vector<2x2xindex>
+// CHECK: return %[[T6]] : vector<2x2xindex>
+func.func @test_linearize_index(%arg0: vector<2x2xindex>, %arg1: vector<2x2xi32>) -> vector<2x2xindex> {
+  %0 = arith.constant dense<[[0, 1], [2, 3]]> : vector<2x2xindex>
+  // Arith and math ops are handled in a generic way; check a few of them.
+  %1 = arith.addi %arg0, %0 : vector<2x2xindex>
+  %2 = arith.index_cast %1 : vector<2x2xindex> to vector<2x2xi32>
+  %3 = arith.muli %2, %arg1 : vector<2x2xi32>
+  %4 = arith.index_cast %3 : vector<2x2xi32> to vector<2x2xindex>
+  return %4 : vector<2x2xindex>
+}
+
+// -----
+// CHECK-LABEL: @add_kernel_f32
+// CHECK: %[[CST0:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
+// CHECK: %[[CST1:.*]] = arith.constant dense<[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]> : vector<16xindex>
+// CHECK: %[[T0:.*]] = vector.splat %{{.*}} : vector<16xindex>
+// CHECK: %[[T1:.*]] = arith.addi %[[T0]], %[[CST0]] : vector<16xindex>
+// CHECK: %[[T2:.*]] = arith.addi %[[T0]], %[[CST1]] : vector<16xindex>
+// CHECK: %[[T3:.*]] = arith.index_cast %[[T1]] : vector<16xindex> to vector<16xi32>
+// CHECK: %[[T4:.*]] = arith.index_cast %[[T2]] : vector<16xindex> to vector<16xi32>
+// CHECK: %[[T5:.*]] = vector.splat %{{.*}} : vector<16xi32>
+// CHECK: %[[T6:.*]] = arith.addi %[[T5]], %[[T3]] : vector<16xi32>
+// CHECK: %[[T7:.*]] = arith.addi %[[T5]], %[[T4]] : vector<16xi32>
+// CHECK: %[[T8:.*]] = arith.index_cast %[[T6]] : vector<16xi32> to vector<16xindex>
+// CHECK: %[[T9:.*]] = arith.index_cast %[[T7]] : vector<16xi32> to vector<16xindex>
+gpu.module @add_kernel_f32 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} {
+  gpu.func @add_kernel_f32(%arg0: memref<*xf32>, %arg1: memref<*xf32>, %arg2: memref<*xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array, known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+    %cst = arith.constant dense<true> : vector<16xi1>
+    %c32 = arith.constant 32 : index
+    %c1024_i32 = arith.constant 1024 : i32
+    %cst_0 = arith.constant dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+    %cst_1 = arith.constant dense<[[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]]> : vector<1x16xindex>
+    %thread_id_x = gpu.thread_id x
+    %thread_id_y = gpu.thread_id y
+    %block_dim_y = gpu.block_dim y
+    %0 = arith.muli %thread_id_x, %block_dim_y : index
+    %1 = arith.addi %0, %thread_id_y : index
+    %cast = memref.cast %arg0 : memref<*xf32> to memref<?xf32>
+    %cast_2 = memref.cast %arg1 : memref<*xf32> to memref<?xf32>
+    %cast_3 = memref.cast %arg2 : memref<*xf32> to memref<?xf32>
+    %2 = arith.remsi %1, %c32 : index
+    %3 = arith.muli %2, %c32 : index
+    %4 = vector.splat %3 : vector<1x16xindex>
+    %5 = arith.addi %4, %cst_0 : vector<1x16xindex>
+    %6 = arith.addi %4, %cst_1 : vector<1x16xindex>
+    %7 = arith.index_cast %5 : vector<1x16xindex> to vector<1x16xi32>
+    %8 = arith.index_cast %6 : vector<1x16xindex> to vector<1x16xi32>
+    %block_id_x = gpu.block_id x
+    %9 = arith.index_cast %block_id_x : index to i32
+    %10 = arith.muli %9, %c1024_i32 : i32
+    %11 = vector.splat %10 : vector<1x16xi32>
+    %12 = arith.addi %11, %7 : vector<1x16xi32>
+    %13 = arith.addi %11, %8 : vector<1x16xi32>
+    %14 = arith.index_cast %12 : vector<1x16xi32> to vector<1x16xindex>
+    %15 = arith.index_cast %13 : vector<1x16xi32> to vector<1x16xindex>
+    %16 = vector.shape_cast %14 : vector<1x16xindex> to vector<16xindex>
+    %17 = xegpu.create_tdesc %cast, %16 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
+    %18 = vector.shape_cast %15 : vector<1x16xindex> to vector<16xindex>
+    %19 = xegpu.create_tdesc %cast, %18 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
+    %20 = xegpu.load %17, %cst <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32>
+    %21 = vector.shape_cast %20 : vector<16xf32> to vector<1x16xf32>
+    %22 = xegpu.load %19, %cst <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32>
+    %23 = vector.shape_cast %22 : vector<16xf32> to vector<1x16xf32>
+    %24 = xegpu.create_tdesc %cast_2, %16 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
+    %25 = xegpu.create_tdesc %cast_2, %18 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
+    %26 = xegpu.load %24, %cst <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32>
+    %27 = vector.shape_cast %26 : vector<16xf32> to vector<1x16xf32>
+    %28 = xegpu.load %25, %cst <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16xf32>
+    %29 = vector.shape_cast %28 : vector<16xf32> to vector<1x16xf32>
+    %30 = arith.addf %21, %27 : vector<1x16xf32>
+    %31 = arith.addf %23, %29 : vector<1x16xf32>
+    %32 = xegpu.create_tdesc %cast_3, %16 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
+    %33 = xegpu.create_tdesc %cast_3, %18 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
+    %34 = vector.shape_cast %30 : vector<1x16xf32> to vector<16xf32>
+    xegpu.store %34, %32, %cst <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>
+    %35 = vector.shape_cast %31 : vector<1x16xf32> to vector<16xf32>
+    xegpu.store %35, %33, %cst <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>
+    gpu.return
+  }
+}
\ No newline at end of file

From 7188e910ee35938f06300e89266f2e875a901803 Mon Sep 17 00:00:00 2001
From: Amy Zhuang
Date: Thu, 21 Nov 2024 18:07:06 +0200
Subject: [PATCH 2/3] Delete nd-ops.mlir
---
 test/Conversion/XeGPUToVC/nd-ops.mlir | 36 ---------------------------
 1 file changed, 36 deletions(-)
 delete mode 100644 test/Conversion/XeGPUToVC/nd-ops.mlir

diff --git a/test/Conversion/XeGPUToVC/nd-ops.mlir b/test/Conversion/XeGPUToVC/nd-ops.mlir
deleted file mode 100644
index 010fb18b0..000000000
--- a/test/Conversion/XeGPUToVC/nd-ops.mlir
+++ /dev/null
@@ -1,36 +0,0 @@
-// Tests ops on nd vectors that should be linearized.
-
-// RUN: imex-opt -convert-xegpu-to-vc -split-input-file %s | FileCheck %s --check-prefixes=CHECK
-module @gemm attributes {gpu.container_module} {
-  gpu.module @test_kernel {
-
-    // CHECK-LABEL: gpu.func @test_index_cast
-    // CHECK: %[[c1024:.*]] = arith.constant 1024 : i32
-    // CHECK: %[[bid:.*]] = gpu.block_id x
-    // CHECK: %[[cst:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xi32>
-    // CHECK: %[[r0:.*]] = arith.index_cast %[[bid]] : index to i32
-    // CHECK: %[[r1:.*]] = arith.muli %[[r0]], %[[c1024]] : i32
-    // CHECK: %[[r2:.*]] = vector.splat %[[r1]] : vector<16xi32>
-    // CHECK: %[[r3:.*]] = arith.addi %[[r2]], %[[r2]] : vector<16xi32>
-    // CHECK: %[[r4:.*]] = arith.addi %[[r2]], %[[cst]] : vector<16xi32>
-    // CHECK: %[[r5:.*]] = arith.index_cast %[[r3]] : vector<16xi32> to vector<16xindex>
-    // CHECK: %[[r6:.*]] = arith.index_cast %[[r4]] : vector<16xi32> to vector<16xindex>
-    // CHECK-NEXT: gpu.return
-    gpu.func @test_index_cast() kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
-      %c1024_i32 = arith.constant 1024 : i32
-      %block_id_x = gpu.block_id x
-      %cst_0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xi32>
-      %23 = arith.index_cast %block_id_x : index to i32
-      %24 = arith.muli %23, %c1024_i32 : i32
-      %25 = vector.splat %24 : vector<16xi32>
-      %26 = arith.addi %25, %25 : vector<16xi32>
-      %27 = vector.shape_cast %26 : vector<16xi32> to vector<1x16xi32>
-      %28 = arith.addi %25, %cst_0 : vector<16xi32>
-      %29 = vector.shape_cast %28 : vector<16xi32> to vector<1x16xi32>
-      %30 = arith.index_cast %27 : vector<1x16xi32> to vector<1x16xindex>
-      %31 = arith.index_cast %29 : vector<1x16xi32> to vector<1x16xindex>
-
-      gpu.return
-    }
-  }
-}

From 44efd077250017bf0fcffb5a0378be83c6fe2f02 Mon Sep 17 00:00:00 2001
From: Amy Zhuang
Date: Thu, 21 Nov 2024 18:40:58 +0200
Subject: [PATCH 3/3] Fix vector-linearize.mlir format
---
 test/Transforms/vector-linearize.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Transforms/vector-linearize.mlir b/test/Transforms/vector-linearize.mlir
index dcfa5217c..9d02d3ef3 100644
--- a/test/Transforms/vector-linearize.mlir
+++ b/test/Transforms/vector-linearize.mlir
@@ -375,4 +375,4 @@ gpu.module @add_kernel_f32 attributes {spirv.target_env = #spirv.target_env<#spi
   xegpu.store %35, %33, %cst <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>
   gpu.return
 }
-}
\ No newline at end of file
+}
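
-- 
Reviewer note, not part of the patch series itself: the effect of the cloned
patterns on index element types is easiest to see in the @test_linearize_index
case added by patch 1. A minimal before/after sketch of that test, transcribed
from its FileCheck lines (the %t names are illustrative):

  // Input: 2-D vectors with index element type.
  %0 = arith.constant dense<[[0, 1], [2, 3]]> : vector<2x2xindex>
  %1 = arith.addi %arg0, %0 : vector<2x2xindex>

  // After the VectorLinearize pass: the constant is reshaped to its 1-D
  // form, the vectorizable op is rewritten over the linearized operands,
  // and vector.shape_cast ops bridge the 2-D/1-D boundaries.
  %cst = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
  %t1 = vector.shape_cast %arg0 : vector<2x2xindex> to vector<4xindex>
  %t2 = arith.addi %t1, %cst : vector<4xindex>
  %t3 = vector.shape_cast %t2 : vector<4xindex> to vector<2x2xindex>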