From 5f215895a96cca65519cbd6a528d5b7bd3ff5f19 Mon Sep 17 00:00:00 2001
From: Md Abdullah Shahneous Bari
Date: Tue, 21 Nov 2023 14:46:41 -0800
Subject: [PATCH] [xegpu][spirv] Add xegpu.simt to spirv JointMatrixINTEL
 lowering and an E2E XeGPU.SIMT GEMM test case

Supported ops:
xegpu.create_nd_tdesc
xegpu.update_nd_offset
xegpu.load_nd
xegpu.store_nd
xegpu.dpas

Add an end-to-end GEMM test case for XeGPU.SIMT.

GEMM parameters in the test case:
Matrix A = 1024x1024xf16
Matrix B = 1024x1024xf16
Matrix C = 1024x1024xf32
---
 include/imex/Conversion/Passes.td             |   6 +-
 .../Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h    |   3 +
 lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp  | 109 +++-
 lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp  | 510 ++++++++++++++++++
 test/Conversion/GPUToSPIRV/loadstore.mlir     |   2 +-
 test/Conversion/GPUToSPIRV/scf.mlir           |   2 +-
 .../XeGPUToSPIRV/atomic_basic.vc.mlir         |   2 +-
 .../XeGPUToSPIRV/barrier_basic.vc.mlir        |   2 +-
 test/Conversion/XeGPUToSPIRV/gemm_basic.mlir  |   2 +-
 .../XeGPUToSPIRV/gemm_basic.vc.mlir           |   4 +-
 .../XeGPUToSPIRV/gemm_basic_1d.vc.mlir        |   4 +-
 .../XeGPUToSPIRV/gemm_basic_gather.vc.mlir    |   2 +-
 test/Conversion/XeGPUToSPIRV/lit.local.cfg    |   3 +-
 .../XeGPUToSPIRV/update_offset.vc.mlir        |   2 +-
 test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir |   2 +-
 .../gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir | 139 +++++
 test/Integration/Dialect/XeGPU/lit.local.cfg  |   4 +-
 .../XeGPU/xegpu-to-llvm-joint-matrix.pp       |  25 +
 .../Dialect/XeGPU/xegpu-to-llvm.pp            |   2 +-
 test/SPIRV/IntelVectorExtension/lit.local.cfg |   3 +-
 20 files changed, 782 insertions(+), 46 deletions(-)
 create mode 100644 test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir
 create mode 100644 test/Integration/Dialect/XeGPU/xegpu-to-llvm-joint-matrix.pp

diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td
index 2766c91c5..36b4830f0 100644
--- a/include/imex/Conversion/Passes.td
+++ b/include/imex/Conversion/Passes.td
@@ -251,7 +251,11 @@ memref, arith and math.
   let constructor = "imex::createConvertGPUXToSPIRVPass()";
   let dependentDialects = ["::mlir::spirv::SPIRVDialect"];
   let options = [
-    Option<"enableVCIntrinsic", "enable-vc-intrinsic","bool", "true",
+    Option<"enableJointMatrix", "enable-joint-matrix","bool", "false",
+           "Enable XeGPU SIMT mode Ops lowered to JointMatrix based Ops">,
+    Option<"enableGenISAIntrinsic", "enable-genisa-intrinsic","bool", "false",
+           "Enable XeGPU Ops lowered to GenISA intrinsics">,
+    Option<"enableVCIntrinsic", "enable-vc-intrinsic","bool", "false",
            "Enable XeGPU Ops lowered to intel vc Intrinsics">
   ];
 }
diff --git a/include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h b/include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h
index 91615dbad..5ecfc4778 100644
--- a/include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h
+++ b/include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h
@@ -30,6 +30,9 @@ void populateXeGPUToVCIntrinsicsPatterns(
 // XeGPU to genISA Intrinsics pattern
 void populateXeGPUToGenISAPatterns(mlir::SPIRVTypeConverter &typeConverter,
                                    mlir::RewritePatternSet &patterns);
+// XeGPU to JointMatrix pattern
+void populateXeGPUToJointMatrixPatterns(mlir::SPIRVTypeConverter &typeConverter,
+                                        mlir::RewritePatternSet &patterns);
 } // namespace imex
 #endif // IMEX_CONVERSION_XEGPUTOSPIRV_H
diff --git a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index 589b3b033..17742bba5 100644
--- a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -156,35 +156,79 @@ void GPUXToSPIRVPass::runOnOperation() {
       eraseOp->erase();
     }
     target->addIllegalDialect();
-    typeConverter.addConversion([&](xegpu::NbarrierType type) -> ::mlir::Type {
-      auto i32Type = ::mlir::IntegerType::get(context, 32);
-      return mlir::VectorType::get(8, i32Type);
-    });
-    typeConverter.addConversion(
-        [&](xegpu::TensorDescType type) -> ::mlir::Type {
-          auto i32Type = ::mlir::IntegerType::get(context, 32);
-          return ::mlir::VectorType::get(8, i32Type);
-        });
-    typeConverter.addConversion([&](::mlir::VectorType type) -> ::mlir::Type {
-      unsigned rank = type.getRank();
-      auto elemType = type.getElementType();
-      if (rank < 1)
-        return type;
-      else {
-        // load2d/store2d is vnni format with 3 dims
-        if (rank == 3 && elemType.getIntOrFloatBitWidth() < 32) {
-          elemType = ::mlir::IntegerType::get(context, 32);
-          rank--;
+    // Only one of the following options should be enabled.
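    // (The three lowering paths selected by enable-vc-intrinsic,
    // enable-joint-matrix and enable-genisa-intrinsic are mutually
    // exclusive: the check below fails the pass when more than one is set,
    // and the emitOpError further down reports when none of them is set.)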
+ if ((this->enableVCIntrinsic && this->enableGenISAIntrinsic) || + (this->enableVCIntrinsic && this->enableJointMatrix) || + (this->enableGenISAIntrinsic && this->enableJointMatrix)) + return signalPassFailure(); + if (this->enableJointMatrix) { + // Tensor descriptor conversion pattern for SIMT JointMatrix + typeConverter.addConversion( + [&](xegpu::TensorDescType type) -> ::mlir::spirv::StructType { + llvm::SmallVector<::mlir::Type, 4> memberTypes; + auto i64Type = ::mlir::IntegerType::get(context, 64); + // Default storage class is spirv::StorageClass::CrossWorkgroup + auto spirvStorageClass = + ::mlir::spirv::StorageClass::CrossWorkgroup; + if (type.getMemoryScope() == xegpu::MemoryScope::SLM) + spirvStorageClass = ::mlir::spirv::StorageClass::Workgroup; + auto baseAddressType = ::mlir::spirv::PointerType::get( + type.getElementType(), spirvStorageClass); + memberTypes.push_back(baseAddressType); + memberTypes.push_back(i64Type); + + for (int i = 0; i < type.getRank(); i++) { + memberTypes.push_back(i64Type); + } + return ::mlir::spirv::StructType::get(memberTypes); + }); + typeConverter.addConversion([&](::mlir::VectorType type) -> ::mlir::Type { + unsigned rank = type.getRank(); + auto elemType = type.getElementType(); + if (rank < 1) + return type; + else { + unsigned sum = 1; + for (unsigned i = 0; i < rank; i++) { + sum *= type.getShape()[i]; + } + if (llvm::isa(elemType)) + elemType = ::mlir::IntegerType::get(context, 64); + return ::mlir::VectorType::get(sum, elemType); } - unsigned sum = 1; - for (unsigned i = 0; i < rank; i++) { - sum *= type.getShape()[i]; + }); + } else { + typeConverter.addConversion( + [&](xegpu::TensorDescType type) -> ::mlir::Type { + auto i32Type = ::mlir::IntegerType::get(context, 32); + return ::mlir::VectorType::get(8, i32Type); + }); + typeConverter.addConversion([&](::mlir::VectorType type) -> ::mlir::Type { + unsigned rank = type.getRank(); + auto elemType = type.getElementType(); + if (rank < 1) + return type; + else { + // load2d/store2d is vnni format with 3 dims + if (rank == 3 && elemType.getIntOrFloatBitWidth() < 32) { + elemType = ::mlir::IntegerType::get(context, 32); + rank--; + } + unsigned sum = 1; + for (unsigned i = 0; i < rank; i++) { + sum *= type.getShape()[i]; + } + if (llvm::isa(elemType)) + elemType = ::mlir::IntegerType::get(context, 64); + return ::mlir::VectorType::get(sum, elemType); } - if (llvm::isa(elemType)) - elemType = ::mlir::IntegerType::get(context, 64); - return ::mlir::VectorType::get(sum, elemType); - } - }); + }); + typeConverter.addConversion( + [&](xegpu::NbarrierType type) -> ::mlir::Type { + auto i32Type = ::mlir::IntegerType::get(context, 32); + return mlir::VectorType::get(8, i32Type); + }); + } //------- Upstream Conversion------------ mlir::populateGPUToSPIRVPatterns(typeConverter, patterns); @@ -200,9 +244,16 @@ void GPUXToSPIRVPass::runOnOperation() { mlir::populateMathToSPIRVPatterns(typeConverter, patterns); if (this->enableVCIntrinsic) imex::populateXeGPUToVCIntrinsicsPatterns(typeConverter, patterns); - else + else if (this->enableJointMatrix) + imex::populateXeGPUToJointMatrixPatterns(typeConverter, patterns); + else if (this->enableGenISAIntrinsic) imex::populateXeGPUToGenISAPatterns(typeConverter, patterns); - + else + module.emitOpError( + "'-imex-convert-gpu-to-spirv' pass must be run with one of the " + "following options to be 'true': " + "'enable-vc-intrinsic', 'enable-joint-matrix', " + "'enable-genisa-intrinsic'"); if (failed(applyFullConversion(gpuModule, *target, std::move(patterns)))) 
return signalPassFailure(); } diff --git a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp index c803c3b92..b4e86fcc8 100644 --- a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp +++ b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp @@ -1438,3 +1438,513 @@ void imex::populateXeGPUToGenISAPatterns(SPIRVTypeConverter &typeConverter, LoadStorePrefetchNdToGenISA>( typeConverter, patterns.getContext()); } + +namespace { +// PVC-specific subgroup size for JointMatrix +constexpr uint64_t jointMatrixSubGroupSize = 16; +// Calculate flattened offsets +// Calculate flattened offsets based on dims and offsets(indices) +Value linearizeOffset(OpBuilder builder, Location loc, + SmallVectorImpl &offsets, + SmallVectorImpl &dims) { + assert(offsets.size() == dims.size() && + "number of offsets & dimensions must be same"); + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = builder.getIntegerAttr(type, value); + return builder.create(loc, type, attr); + }; + + auto i64Type = builder.getI64Type(); + auto rank = dims.size(); + Value linearizedOffset = createIntConstant(i64Type, 0); + for (unsigned i = 0; i < rank; i++) { + Value perDimstrideMultiplier = createIntConstant(i64Type, 1); + for (unsigned j = i + 1; j < rank; j++) { + perDimstrideMultiplier = builder.create( + loc, i64Type, perDimstrideMultiplier, dims[j]); + } + perDimstrideMultiplier = builder.create( + loc, i64Type, perDimstrideMultiplier, offsets[i]); + + linearizedOffset = builder.create( + loc, i64Type, linearizedOffset, perDimstrideMultiplier); + } + return linearizedOffset; +} + +unsigned getElementPerWI(imex::xegpu::TensorDescType tDescType) { + imex::xegpu::SubGroupMapAttr sgMap; + auto encoding = tDescType.getEncoding(); + if (auto xeMapAttr = llvm::dyn_cast(encoding)) { + sgMap = xeMapAttr.getSg(); + } else { + sgMap = llvm::dyn_cast(encoding); + } + auto blockSize = tDescType.getShape(); + auto wiLayout = sgMap.getWiLayout(); + auto wiData = sgMap.getWiData(); + unsigned elemPerWI = 1; + for (size_t i = 0; i < wiData.size(); i++) { + if (wiData[i] != 1) + llvm_unreachable("wi_data must be 1 for all dimension for " + "JointMatrix lowering"); + elemPerWI *= (blockSize[i] / wiLayout[i]); + } + return elemPerWI; +} + +class CreateNdDescToJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(CreateNdDescOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + assert(op.getBoundaryCheck() == false && + "for xegpu to joint matrix lowering boundary_check attribute must " + "be false"); + auto loc = op.getLoc(); + auto tileType = op.getTensorDesc().getType(); + auto rank = tileType.getRank(); + + // Set the SPIR-V Struct to represent the Tensor Descriptor + // The create_nd_tdesc returns a spirv.struct + // The elements in the struct contains the following elements + // element 0 = base address pointer : spirv.ptr + // element 1 = 1D offset : i64 + // element 2 = X Dim Size : i64 + // element 3 = Y Dim Size : i64 + // [SPIR-V lowering uses 1D flattened addresses passed as kernel parameters] + SmallVector memberTypes; + auto i64Type = rewriter.getI64Type(); + // Default storage class is spirv::StorageClass::CrossWorkgroup + auto spirvStorageClass = spirv::StorageClass::CrossWorkgroup; + // For memref use memref spirv storage attribute if available + auto srcType = op.getSourceType(); + if (llvm::isa(srcType)) { + auto sc = dyn_cast_or_null( + 
llvm::cast(srcType).getMemorySpace()); + if (sc) + spirvStorageClass = sc.getValue(); + } + auto spirvBaseAddressType = + spirv::PointerType::get(op.getSourceElementType(), spirvStorageClass); + + memberTypes.push_back(spirvBaseAddressType); + memberTypes.push_back(i64Type); + // For nD descriptor, dimesion=rank, so we need dimSize for all the + // dimensions + for (int i = 0; i < rank; i++) { + memberTypes.push_back(i64Type); + } + + auto ndDescStruct = spirv::StructType::get(memberTypes); + + Value payLoad = rewriter.create(loc, ndDescStruct); + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = rewriter.getIntegerAttr(type, value); + return rewriter.create(loc, type, attr); + }; + + // Insert the base address to the ndDescStruct struct + Value genericBasePtr; + // If the base type is memref, add a bitcast op + // If the base type is not memref type, add a ConvertUToPtr op + if (llvm::isa(srcType)) { + genericBasePtr = rewriter.create( + loc, spirvBaseAddressType, adaptor.getSource()); + } else { + genericBasePtr = rewriter.create( + loc, spirvBaseAddressType, adaptor.getSource()); + } + + payLoad = rewriter.create( + loc, genericBasePtr, payLoad, llvm::ArrayRef(0)); + + // TODO: We should be able to use op.getOffsets() directly with index cast + // But we need support from XeGPU dialect definition to return i64_t + + auto createOffset = [&](unsigned idx) -> Value { + Value val; + if (ShapedType::isDynamic(op.getStaticOffsets()[idx])) { + val = op.getOffsets()[idx]; + // Cast index type to i64 + val = rewriter.create(loc, i64Type, val); + } else { + val = createIntConstant(i64Type, op.getStaticOffsets()[idx]); + } + return val; + }; + + // TODO: We should be able to use op.getShape() directly with index cast + // But we need support from XeGPU dialect definition to return i64_t + + auto createShape = [&](unsigned idx) -> Value { + Value val; + if (ShapedType::isDynamic(op.getStaticShape()[idx])) { + val = op.getShape()[idx]; + // Cast index type to i64 + val = rewriter.create(loc, i64Type, val); + } else { + val = createIntConstant(i64Type, op.getStaticShape()[idx]); + } + return val; + }; + + SmallVector nDOffsets; + SmallVector nDDims; + for (unsigned i = 0; i < rank; i++) { + nDOffsets.push_back(createOffset(i)); + } + + for (unsigned i = 0; i < rank; i++) { + nDDims.push_back(createShape(i)); + } + + // Calculate the 1-D offset, since the memrefs are flattened when + // passed to SPIR-V + Value linearizedOffset = linearizeOffset(rewriter, loc, nDOffsets, nDDims); + // Insert the flattened (1D) offset to the ndDescStruct struct + + payLoad = rewriter.create( + loc, linearizedOffset, payLoad, llvm::ArrayRef(1)); + for (int i = 0; i < rank; i++) { + payLoad = rewriter.create(loc, nDDims[i], + payLoad, (i + 2)); + } + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +class UpdateNDOffsetJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(UpdateNDOffsetOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto desc = adaptor.getTensorDesc(); + const int dimStartIdx = 2; + auto i64Type = rewriter.getI64Type(); + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = rewriter.getIntegerAttr(type, value); + return rewriter.create(loc, type, attr); + }; + // Calculate the 1-D offset, since the memrefs are flattened when + // passed to SPIR-V + Value offset1D; + offset1D = createIntConstant(i64Type, 0); + 
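    // The loop below mirrors linearizeOffset() used by create_nd_tdesc and
    // computes a row-major linearized offset:
    //   offset1D = sum_i(offsets[i] * prod_{j > i} dimSize[j])
    // with the dim sizes read from struct members 2..rank+1. For example,
    // for the 2D descriptors over 1024x1024 memrefs in the SIMT GEMM test,
    // an update of [0, 16] adds 0 * 1024 + 16 = 16 elements to the 1D
    // offset held in struct member 1.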
auto offsets = adaptor.getOffsets(); + auto rank = op.getTensorDesc().getType().getRank(); + // number of offsets & tensorDescriptor rank must be same + assert(offsets.size() == (size_t)op.getTensorDesc().getType().getRank() && + "number of offsets & tensorDescriptor rank must be same"); + for (unsigned i = 0; i < rank; i++) { + Value perDimstrideMultiplier; + perDimstrideMultiplier = createIntConstant(i64Type, 1); + for (unsigned j = i + 1; j < rank; j++) { + Value dimSize = rewriter.create( + loc, desc, (j + dimStartIdx)); + perDimstrideMultiplier = rewriter.create( + loc, i64Type, perDimstrideMultiplier, dimSize); + } + // Cast index type to i64 + Value offsetVal = + rewriter.create(loc, i64Type, offsets[i]); + perDimstrideMultiplier = rewriter.create( + loc, i64Type, perDimstrideMultiplier, offsetVal); + + offset1D = rewriter.create(loc, i64Type, offset1D, + perDimstrideMultiplier); + } + + // Add the newOffset to previous offset + Value prev1DOffset = rewriter.create( + loc, desc, llvm::ArrayRef(1)); + offset1D = + rewriter.create(loc, i64Type, offset1D, prev1DOffset); + // Update the descriptor with the new offset + desc = rewriter.create(loc, offset1D, desc, + llvm::ArrayRef(1)); + rewriter.replaceOp(op, desc); + return success(); + } +}; + +class LoadNDJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(LoadNDOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (op.getTranspose()) + op.emitError("transpose is not currently supported for XeGPU to " + "JointMatrix lowering"); + auto loc = op.getLoc(); + auto tDesc = adaptor.getTensorDesc(); + auto tDescType = op.getTensorDesc().getType(); + int rank = tDescType.getRank(); + assert(rank == 2 && "only support 2d load for now"); + + // Get the base address + Value baseAddress = rewriter.create( + loc, tDesc, llvm::ArrayRef(0)); + // Get the offset + Value offset = rewriter.create( + loc, tDesc, llvm::ArrayRef(1)); + + SmallVector linearizedIndices; + // Get the load address + Value loadAddress = rewriter.create( + loc, baseAddress, offset, linearizedIndices); + + // Stride for jointMatrixLoad = Y Dim size + // TODO: what do we do for transpose case? + Value stride = rewriter.create( + loc, tDesc, llvm::ArrayRef(3)); + + // Figure out the Matrix Use type (MatrixA, MatrixB, Accumulator) + uint32_t matrixUse; + // Don't expect vnni axis to be set for the Accumulator + + if (auto vnniAxis = adaptor.getVnniAxis()) + // vnniAxis 0 -> MatrixB -> matrixUse = 1 + // vnniAxis 1 -> MatrixA -> matrixUse = 0 + matrixUse = (*vnniAxis + 1) % 2; + else + // vnniAxis empty -> Accumulator -> matrixUse = 2 + matrixUse = 2; + + // TODO: Need to discuss how to handle transpose, load then transpose or + // transposed load? 
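    // MatrixUse encoding used below: 0 = MatrixA, 1 = MatrixB,
    // 2 = Accumulator. vnni_axis = 1 marks the A operand, vnni_axis = 0 the
    // VNNI-packed B operand, and a missing vnni_axis the accumulator. With
    // the wi_layout = [1, 16] maps used in the SIMT GEMM test,
    // getElementPerWI() yields (8/1) * (16/16) = 8 values per work-item for
    // an 8x16 A tile and 16 for a 16x16 B tile.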
+ auto jointMatrixtype = spirv::JointMatrixINTELType::get( + tDescType.getElementType(), spirv::Scope::Subgroup, + tDescType.getDimSize(0), tDescType.getDimSize(1), + spirv::MatrixLayout::RowMajor, *spirv::symbolizeMatrixUse(matrixUse)); + + auto jointMatrixLoaded = rewriter.create( + loc, jointMatrixtype, loadAddress, stride, + ::mlir::spirv::MatrixLayout::RowMajor, ::mlir::spirv::Scope::Subgroup, + nullptr, nullptr); + + // TODO: Once architecture-spcific info are in place, add subgroup_size + // restriction verification + unsigned elemPerWI = getElementPerWI(tDescType); + auto elemType = tDescType.getElementType(); + auto perWIVectorType = VectorType::get(elemPerWI, elemType); + Value payLoad = rewriter.create(loc, perWIVectorType); + llvm::SmallVector extractedVal; + for (unsigned i = 0; i < elemPerWI; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedVal.push_back(rewriter.create( + loc, jointMatrixLoaded, idx)); + } + + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < elemPerWI; i++) { + auto idx = createConstantI32(loc, rewriter, i); + payLoad = rewriter.create( + loc, payLoad, extractedVal[i], idx); + } + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +class StoreNDJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(StoreNDOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto tDesc = adaptor.getTensorDesc(); + auto tDescType = op.getTensorDesc().getType(); + int rank = tDescType.getRank(); + assert(rank == 2 && "only support 2d load for now"); + + // Get the base address + Value baseAddress = rewriter.create( + loc, tDesc, llvm::ArrayRef(0)); + // Get the offset + Value offset = rewriter.create( + loc, tDesc, llvm::ArrayRef(1)); + + SmallVector linearizedIndices; + // Get the load address + Value loadAddress = rewriter.create( + loc, baseAddress, offset, linearizedIndices); + + // Stride for jointMatrixLoad = Y Dim size + // TODO: what do we do for transpose case? + Value stride = rewriter.create( + loc, tDesc, llvm::ArrayRef(3)); + + // For Store, we only allow Accumulator type matrix to store. + // TODO: We need to Add option on the xegpu.store_nd to support storing B + // matrix for that we need to add vnni_axis attribute to store_nd op as + // well. 
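  // The store path mirrors the load path in reverse: the per-work-item
  // result vector is copied element by element into an Accumulator joint
  // matrix, which is then written out through the SPIR-V joint-matrix store
  // op using the descriptor's Y-dim size (struct member 3) as the stride.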
+ uint32_t matrixUse = 2; + // Don't expect vnni axis to be set for the Accumulator + auto jointMatrixtype = spirv::JointMatrixINTELType::get( + tDescType.getElementType(), spirv::Scope::Subgroup, + tDescType.getDimSize(0), tDescType.getDimSize(1), + spirv::MatrixLayout::RowMajor, *spirv::symbolizeMatrixUse(matrixUse)); + Value matrix = rewriter.create(loc, jointMatrixtype); + + // TODO: Once architecture-spcific info are in place, add subgroup_size + // restriction verification + unsigned elemPerWI = getElementPerWI(tDescType); + // auto elemType = tDescType.getElementType(); + // Get the 2D vector + auto perWIVector = adaptor.getValue(); + llvm::SmallVector extractedVal; + for (unsigned i = 0; i < elemPerWI; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedVal.push_back(rewriter.create( + loc, perWIVector, idx)); + } + + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < elemPerWI; i++) { + auto idx = createConstantI32(loc, rewriter, i); + matrix = rewriter.create( + loc, matrix, extractedVal[i], idx); + } + auto payLoad = rewriter.create( + loc, loadAddress, matrix, stride, ::mlir::spirv::MatrixLayout::RowMajor, + ::mlir::spirv::Scope::Subgroup, nullptr, nullptr); + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +class DpasJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(DpasOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto vectorA = op.getLhs(); + auto vectorB = op.getRhs(); + auto vectorC = op.getAcc(); + { + OpBuilder::InsertionGuard guard(rewriter); + auto func = op->getParentOfType(); + rewriter.setInsertionPointAfter(func); + rewriter.create( + op.getLoc(), func, spirv::ExecutionMode::SubgroupSize, + int(jointMatrixSubGroupSize)); + } + // Matrix row = 1st dim of input vector + // Matrix colomn = 2nd dim of input vector * jointMatrixSubGroupSize + auto matrixAType = spirv::JointMatrixINTELType::get( + vectorA.getType().getElementType(), spirv::Scope::Subgroup, + vectorA.getType().getShape()[0], + vectorA.getType().getShape()[1] * jointMatrixSubGroupSize, + spirv::MatrixLayout::RowMajor, spirv::MatrixUse::MatrixA); + + // B matrix vector is passed VNNI-transformed, so row = dim0 *dim3 + auto matrixBType = spirv::JointMatrixINTELType::get( + vectorB.getType().getElementType(), spirv::Scope::Subgroup, + vectorB.getType().getShape()[0] * vectorB.getType().getShape()[2], + vectorB.getType().getShape()[1] * jointMatrixSubGroupSize, + spirv::MatrixLayout::RowMajor, spirv::MatrixUse::MatrixB); + + auto matrixCType = spirv::JointMatrixINTELType::get( + vectorC.getType().getElementType(), spirv::Scope::Subgroup, + vectorC.getType().getShape()[0], + vectorC.getType().getShape()[1] * jointMatrixSubGroupSize, + spirv::MatrixLayout::RowMajor, spirv::MatrixUse::Accumulator); + + Value matrixA = rewriter.create(loc, matrixAType); + Value matrixB = rewriter.create(loc, matrixBType); + Value matrixC = rewriter.create(loc, matrixCType); + // Create Matrices from the vectors + // Get the flattened vectors through the adaptor, since SPIRV only allows 1D + // vector + auto perWIVectorA = adaptor.getLhs(); + auto perWIVectorB = adaptor.getRhs(); + auto perWIVectorC = adaptor.getAcc(); + + llvm::SmallVector extractedValA; + auto perWIelemsA = + llvm::cast(perWIVectorA.getType()).getNumElements(); + for (unsigned i = 0; i < 
perWIelemsA; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedValA.push_back(rewriter.create( + loc, perWIVectorA, idx)); + } + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < perWIelemsA; i++) { + auto idx = createConstantI32(loc, rewriter, i); + matrixA = rewriter.create( + loc, matrixA, extractedValA[i], idx); + } + + llvm::SmallVector extractedValB; + auto perWIelemsB = + llvm::cast(perWIVectorB.getType()).getNumElements(); + for (unsigned i = 0; i < perWIelemsB; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedValB.push_back(rewriter.create( + loc, perWIVectorB, idx)); + } + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < perWIelemsB; i++) { + auto idx = createConstantI32(loc, rewriter, i); + matrixB = rewriter.create( + loc, matrixB, extractedValB[i], idx); + } + + llvm::SmallVector extractedValC; + auto perWIelemsC = + llvm::cast(perWIVectorC.getType()).getNumElements(); + for (unsigned i = 0; i < perWIelemsC; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedValC.push_back(rewriter.create( + loc, perWIVectorC, idx)); + } + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < perWIelemsC; i++) { + auto idx = createConstantI32(loc, rewriter, i); + matrixC = rewriter.create( + loc, matrixC, extractedValC[i], idx); + } + + Value result = rewriter.create( + loc, matrixA, matrixB, matrixC, spirv::Scope::Subgroup); + + Value payLoad = + rewriter.create(loc, perWIVectorC.getType()); + llvm::SmallVector extractedValResult; + auto perWIelemsResult = perWIelemsC; + for (unsigned i = 0; i < perWIelemsResult; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedValResult.push_back( + rewriter.create(loc, result, idx)); + } + for (unsigned i = 0; i < perWIelemsResult; i++) { + auto idx = createConstantI32(loc, rewriter, i); + payLoad = rewriter.create( + loc, payLoad, extractedValResult[i], idx); + } + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +} // namespace + +void imex::populateXeGPUToJointMatrixPatterns(SPIRVTypeConverter &typeConverter, + RewritePatternSet &patterns) { + patterns.add(typeConverter, patterns.getContext()); +} diff --git a/test/Conversion/GPUToSPIRV/loadstore.mlir b/test/Conversion/GPUToSPIRV/loadstore.mlir index e2cb41506..a6393fe8a 100644 --- a/test/Conversion/GPUToSPIRV/loadstore.mlir +++ b/test/Conversion/GPUToSPIRV/loadstore.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -allow-unregistered-dialect -split-input-file -imex-convert-gpu-to-spirv -verify-diagnostics %s -o - | FileCheck %s +// RUN: imex-opt -allow-unregistered-dialect -split-input-file -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' -verify-diagnostics %s -o - | FileCheck %s module attributes { gpu.container_module, diff --git a/test/Conversion/GPUToSPIRV/scf.mlir b/test/Conversion/GPUToSPIRV/scf.mlir index 2f89e5ce1..77d21877e 100644 --- a/test/Conversion/GPUToSPIRV/scf.mlir +++ b/test/Conversion/GPUToSPIRV/scf.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -allow-unregistered-dialect -imex-convert-gpu-to-spirv -verify-diagnostics %s -o - | FileCheck %s +// RUN: imex-opt -allow-unregistered-dialect -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' -verify-diagnostics %s -o - | FileCheck %s module attributes { gpu.container_module, diff --git 
a/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir b/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir index 5e0122015..a06272f10 100644 --- a/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=CHECK module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_8x16xf32 : memref<8x16xf32> = dense<4.000000e-01> func.func @test(%arg0: memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Conversion/XeGPUToSPIRV/barrier_basic.vc.mlir b/test/Conversion/XeGPUToSPIRV/barrier_basic.vc.mlir index 0d72915de..bd8103b18 100644 --- a/test/Conversion/XeGPUToSPIRV/barrier_basic.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/barrier_basic.vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir index 3c0ca946b..19113c35b 100644 --- a/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=false' %s | FileCheck %s +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-genisa-intrinsic=true' %s | FileCheck %s #sg_map_fp16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}> #sg_map_fp16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}> diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir index a4c12ec46..47570ea76 100644 --- a/test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir @@ -1,5 +1,5 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s -// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=LSC +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s +// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=LSC module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.vc.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.vc.mlir index 39569d345..0b84b4a3e 100644 --- a/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.vc.mlir @@ -1,5 +1,5 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-RAW -// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-LSC +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=CHECK-RAW +// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt 
-imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=CHECK-LSC module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.vc.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.vc.mlir index ee7986859..859fba343 100644 --- a/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-RAW +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=CHECK-RAW module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/lit.local.cfg b/test/Conversion/XeGPUToSPIRV/lit.local.cfg index d23a14a3b..fb1018074 100644 --- a/test/Conversion/XeGPUToSPIRV/lit.local.cfg +++ b/test/Conversion/XeGPUToSPIRV/lit.local.cfg @@ -1,4 +1,5 @@ local_excludes = [ 'gemm_basic.mlir' ] -config.excludes.update(local_excludes) +if(not config.imex_enable_excluded_tests): + config.excludes.update(local_excludes) diff --git a/test/Conversion/XeGPUToSPIRV/update_offset.vc.mlir b/test/Conversion/XeGPUToSPIRV/update_offset.vc.mlir index bb5905b13..2d46b0c90 100644 --- a/test/Conversion/XeGPUToSPIRV/update_offset.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/update_offset.vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir b/test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir index 36d438780..b51354e01 100644 --- a/test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s gpu.module @test attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { // CHECK: spirv.ConvertPtrToU diff --git a/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir b/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir new file mode 100644 index 000000000..1fb4c02b3 --- /dev/null +++ b/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir @@ -0,0 +1,139 @@ +// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm-joint-matrix.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm-joint-matrix.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: 
--entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck + +// NOTE: This test case provides an end-to-end example of XeGPU SIMT mode ops to SPIR-V JointMatrix ops lowering + +#sg_map_fp16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}> +#sg_map_fp16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}> +#sg_map_fp16_c = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}> +module @gemm attributes {gpu.container_module} { + func.func @test(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %A_gpu = gpu.alloc host_shared () : memref<1024x1024xf16> + memref.copy %A, %A_gpu : memref<1024x1024xf16> to memref<1024x1024xf16> + %B_gpu = gpu.alloc host_shared () : memref<1024x1024xf16> + memref.copy %B, %B_gpu : memref<1024x1024xf16> to memref<1024x1024xf16> + %C_gpu = gpu.alloc host_shared () : memref<1024x1024xf32> + memref.copy %C, %C_gpu : memref<1024x1024xf32> to memref<1024x1024xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c128, %c64, %c1) threads in (%c1, %c16, %c1) args(%A_gpu : memref<1024x1024xf16>, %B_gpu : memref<1024x1024xf16>, %C_gpu : memref<1024x1024xf32>) + gpu.dealloc %A_gpu : memref<1024x1024xf16> + gpu.dealloc %B_gpu : memref<1024x1024xf16> + return %C_gpu : memref<1024x1024xf32> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf16>, %c: memref<1024x1024xf32>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c8 : index + %n = arith.muli %block_id_y, %c16 : index + + %1 = xegpu.create_nd_tdesc %a[%m, %c0] {boundary_check = false} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> + %2 = xegpu.create_nd_tdesc %b[%c0, %n] {boundary_check = false} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> + %tmpC = arith.constant dense<0.0> : vector<8xf32> + %3 = vector.shape_cast %tmpC : vector<8xf32> to vector<8x1xf32> + %tmp0, %tmp1, %result = scf.for %k= %c0 to %c1024 step %c16 iter_args(%subA = %1, %subB = %2, %subC = %3) + -> (!xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a>, !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b>, vector<8x1xf32>) { + %4 = xegpu.load_nd %subA {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> -> vector<8x1x1xf16> + %5 = xegpu.load_nd %subB {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x1x2xf16> + %6 = xegpu.dpas %4, %5, %subC : vector<8x1x1xf16>, vector<8x1x2xf16>, vector<8x1xf32> -> vector<8x1xf32> + %7 = xegpu.update_nd_offset %subA, [%c0, %c16] : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> + -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> + %8 = xegpu.update_nd_offset %subB, [%c16, %c0] : !xegpu.tensor_desc<16x16xf16, 
#sg_map_fp16_b> + -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> + scf.yield %7, %8, %6: !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a>, !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b>, vector<8x1xf32> + } + %9 = xegpu.create_nd_tdesc %c[%m, %n] {boundary_check = false} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> + xegpu.store_nd %result, %9 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> + gpu.return + } + } + func.func @main() attributes {llvm.emit_c_interface} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + %cf_0 = arith.constant 0.0 : f16 + %cf_1 = arith.constant 1.0 : f16 + %A = memref.alloc() : memref<1024x1024xf16> + %B = memref.alloc() : memref<1024x1024xf16> + %C = memref.alloc() : memref<1024x1024xf32> + %C_ref = memref.alloc() : memref<1024x1024xf32> + // intialize matrix A ; A[i, j] = j + scf.for %i = %c0 to %c1024 step %c1 { + scf.for %j = %c0 to %c1024 step %c1 { + %t = index.castu %j : index to i16 + %val = arith.uitofp %t : i16 to f16 + memref.store %val, %A[%i, %j] : memref<1024x1024xf16> + } + } + // make matrix B an identity matrix + scf.for %i = %c0 to %c1024 step %c1 { + scf.for %j = %c0 to %c1024 step %c1 { + %i_i32 = index.castu %i : index to i32 + %j_i32 = index.castu %j : index to i32 + %i_j_same = arith.cmpi eq, %i_i32, %j_i32 : i32 + + scf.if %i_j_same { + memref.store %cf_1, %B[%i, %j] : memref<1024x1024xf16> + } else { + memref.store %cf_0, %B[%i, %j] : memref<1024x1024xf16> + } + } + } + // intialize matrix C and C_ref ; C[i, j] = 0 + %c0_f32 = arith.constant 0.0 : f32 + scf.for %i = %c0 to %c1024 step %c1 { + scf.for %j = %c0 to %c1024 step %c1 { + memref.store %c0_f32, %C[%i, %j] : memref<1024x1024xf32> + memref.store %c0_f32, %C_ref[%i, %j] : memref<1024x1024xf32> + } + } + // compute C for reference + scf.for %i = %c0 to %c1024 step %c1 { + scf.for %j = %c0 to %c1024 step %c1 { + %c_curr = memref.load %C_ref[%i, %j] : memref<1024x1024xf32> + %c_val = scf.for %k = %c0 to %c1024 step %c1 iter_args(%c_partial = %c_curr) -> f32 { + %a_val = memref.load %A[%i, %k] : memref<1024x1024xf16> + %b_val = memref.load %B[%k, %j] : memref<1024x1024xf16> + %t = arith.mulf %a_val, %b_val : f16 + %t_cast = arith.extf %t : f16 to f32 + %c_sum = arith.addf %t_cast, %c_partial : f32 + scf.yield %c_sum : f32 + } + memref.store %c_val , %C_ref[%i, %j] : memref<1024x1024xf32> + } + } + %2 = call @test(%A, %B, %C) : (memref<1024x1024xf16>, memref<1024x1024xf16>, memref<1024x1024xf32>) -> memref<1024x1024xf32> + %cast_C = memref.cast %2 : memref<1024x1024xf32> to memref<*xf32> + %cast_C_ref = memref.cast %C_ref : memref<1024x1024xf32> to memref<*xf32> + // call @printMemrefF32(%cast_C) : (memref<*xf32>) -> () + // call @printMemrefF32(%cast_C_ref) : (memref<*xf32>) -> () + // CHECK: [ALLCLOSE: TRUE] + call @printAllcloseF32(%cast_C, %cast_C_ref) : (memref<*xf32>, memref<*xf32>) -> () + memref.dealloc %A : memref<1024x1024xf16> + memref.dealloc %B : memref<1024x1024xf16> + memref.dealloc %C : memref<1024x1024xf32> + memref.dealloc %C_ref : memref<1024x1024xf32> + return + } + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} + func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} +} diff --git a/test/Integration/Dialect/XeGPU/lit.local.cfg b/test/Integration/Dialect/XeGPU/lit.local.cfg index e084b0d12..ab78e6edf 
100644 --- a/test/Integration/Dialect/XeGPU/lit.local.cfg +++ b/test/Integration/Dialect/XeGPU/lit.local.cfg @@ -1,9 +1,11 @@ local_excludes = [ 'gemm_1024x1024xf16.mlir', 'gemm_1024x1024xf16.using.updateoffset.mlir', + 'gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir', 'gemm_1024x1016x1016_f16_f16_f32.mlir', 'load2d_dpas_store2d.mlir', 'load2d-padding-f32.mlir', 'load2d-padding.mlir' ] -config.excludes.update(local_excludes) +if(not config.imex_enable_excluded_tests): + config.excludes.update(local_excludes) diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-llvm-joint-matrix.pp b/test/Integration/Dialect/XeGPU/xegpu-to-llvm-joint-matrix.pp new file mode 100644 index 000000000..e4b615146 --- /dev/null +++ b/test/Integration/Dialect/XeGPU/xegpu-to-llvm-joint-matrix.pp @@ -0,0 +1,25 @@ +// linalg dialect to gpu dialect lowering pipeline +// Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. +builtin.module( + imex-convert-gpu-to-spirv{enable-joint-matrix=true} + canonicalize + spirv.module(spirv-lower-abi-attrs + spirv-update-vce) + func.func(llvm-request-c-wrappers) + serialize-spirv + convert-vector-to-scf + convert-gpu-to-gpux + convert-scf-to-cf + convert-cf-to-llvm + convert-vector-to-llvm + convert-index-to-llvm + convert-arith-to-llvm + convert-func-to-llvm + convert-math-to-llvm + convert-gpux-to-llvm + convert-index-to-llvm + expand-strided-metadata + lower-affine + finalize-memref-to-llvm + reconcile-unrealized-casts) +// End diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp b/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp index bc7826608..72f8264b2 100644 --- a/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp +++ b/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp @@ -1,7 +1,7 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. builtin.module( - imex-convert-gpu-to-spirv + imex-convert-gpu-to-spirv{enable-vc-intrinsic=true} spirv.module(spirv-lower-abi-attrs spirv-update-vce) func.func(llvm-request-c-wrappers) diff --git a/test/SPIRV/IntelVectorExtension/lit.local.cfg b/test/SPIRV/IntelVectorExtension/lit.local.cfg index 00604223a..3cc7d0832 100644 --- a/test/SPIRV/IntelVectorExtension/lit.local.cfg +++ b/test/SPIRV/IntelVectorExtension/lit.local.cfg @@ -6,4 +6,5 @@ local_excludes = ['DPAS_Dynamic_Size_BF16.mlir', 'Load_2d_raw_send.mlir', 'Store2d_raw_send.mlir', 'GEMM_4kx4kx4k_BF16.mlir'] -config.excludes.update(local_excludes) +if(not config.imex_enable_excluded_tests): + config.excludes.update(local_excludes)
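Note on the shape bookkeeping behind DpasJointMatrix and the new SIMT GEMM test: every per-work-item vector is widened along its second dimension by the subgroup size (jointMatrixSubGroupSize = 16 on PVC), and the VNNI-packed B operand folds its innermost dimension back into rows. The minimal stand-alone C++ sketch below (illustrative only, not part of the patch; all names in it are local to the example) reproduces that arithmetic for the 8x16x16 DPAS tile and the (128, 64) block grid used in gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir.

// Illustrative shape check for the JointMatrix lowering of xegpu.dpas
// (stand-alone sketch, not part of the patch).
#include <cstdio>

int main() {
  const int subGroupSize = 16; // matches jointMatrixSubGroupSize for PVC

  // Per-work-item vector shapes from the SIMT GEMM test:
  //   A: vector<8x1x1xf16>, B: vector<8x1x2xf16> (VNNI), C: vector<8x1xf32>
  int aRows = 8,     aCols = 1 * subGroupSize; // 8x16  MatrixA
  int bRows = 8 * 2, bCols = 1 * subGroupSize; // 16x16 MatrixB (rows = dim0 * dim2)
  int cRows = 8,     cCols = 1 * subGroupSize; // 8x16  Accumulator
  std::printf("A %dx%d  B %dx%d  C %dx%d\n", aRows, aCols, bRows, bCols,
              cRows, cCols);

  // Launch geometry from the test: a (128, 64) block grid with one 8x16
  // C tile per subgroup covers the full 1024x1024 output.
  std::printf("C coverage: %d x %d\n", 128 * cRows, 64 * cCols);
  return 0;
}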