From 5f215895a96cca65519cbd6a528d5b7bd3ff5f19 Mon Sep 17 00:00:00 2001
From: Md Abdullah Shahneous Bari
Date: Tue, 21 Nov 2023 14:46:41 -0800
Subject: [PATCH] [xegpu][spirv] Add xegpu.simt to spirv JointMatrixINTEL
 lowering and an E2E XeGPU.SIMT GEMM test case

Supported ops:
xegpu.create_nd_tdesc
xegpu.update_nd_offset
xegpu.load_nd
xegpu.store_nd
xegpu.dpas

Add an end-to-end GEMM test case for XeGPU.SIMT.

GEMM parameters in the test case:
Matrix A = 1024x1024xf16
Matrix B = 1024x1024xf16
Matrix C = 1024x1024xf32
---
 include/imex/Conversion/Passes.td             |   6 +-
 .../Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h    |   3 +
 lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp  | 109 +++-
 lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp  | 510 ++++++++++++++++++
 test/Conversion/GPUToSPIRV/loadstore.mlir     |   2 +-
 test/Conversion/GPUToSPIRV/scf.mlir           |   2 +-
 .../XeGPUToSPIRV/atomic_basic.vc.mlir         |   2 +-
 .../XeGPUToSPIRV/barrier_basic.vc.mlir        |   2 +-
 test/Conversion/XeGPUToSPIRV/gemm_basic.mlir  |   2 +-
 .../XeGPUToSPIRV/gemm_basic.vc.mlir           |   4 +-
 .../XeGPUToSPIRV/gemm_basic_1d.vc.mlir        |   4 +-
 .../XeGPUToSPIRV/gemm_basic_gather.vc.mlir    |   2 +-
 test/Conversion/XeGPUToSPIRV/lit.local.cfg    |   3 +-
 .../XeGPUToSPIRV/update_offset.vc.mlir        |   2 +-
 test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir |   2 +-
 .../gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir | 139 +++++
 test/Integration/Dialect/XeGPU/lit.local.cfg  |   4 +-
 .../XeGPU/xegpu-to-llvm-joint-matrix.pp       |  25 +
 .../Dialect/XeGPU/xegpu-to-llvm.pp            |   2 +-
 test/SPIRV/IntelVectorExtension/lit.local.cfg |   3 +-
 20 files changed, 782 insertions(+), 46 deletions(-)
 create mode 100644 test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir
 create mode 100644 test/Integration/Dialect/XeGPU/xegpu-to-llvm-joint-matrix.pp

diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td
index 2766c91c5..36b4830f0 100644
--- a/include/imex/Conversion/Passes.td
+++ b/include/imex/Conversion/Passes.td
@@ -251,7 +251,11 @@ memref, arith and math.
   let constructor = "imex::createConvertGPUXToSPIRVPass()";
   let dependentDialects = ["::mlir::spirv::SPIRVDialect"];
   let options = [
-    Option<"enableVCIntrinsic", "enable-vc-intrinsic","bool", "true",
+    Option<"enableJointMatrix", "enable-joint-matrix","bool", "false",
+           "Enable XeGPU SIMT mode Ops lowered to JointMatrix based Ops">,
+    Option<"enableGenISAIntrinsic", "enable-genisa-intrinsic","bool", "false",
+           "Enable XeGPU Ops lowered to GenISA intrinsics">,
+    Option<"enableVCIntrinsic", "enable-vc-intrinsic","bool", "false",
            "Enable XeGPU Ops lowered to intel vc Intrinsics">
   ];
 }
diff --git a/include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h b/include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h
index 91615dbad..5ecfc4778 100644
--- a/include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h
+++ b/include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h
@@ -30,6 +30,9 @@ void populateXeGPUToVCIntrinsicsPatterns(
 // XeGPU to genISA Intrinsics pattern
 void populateXeGPUToGenISAPatterns(mlir::SPIRVTypeConverter &typeConverter,
                                    mlir::RewritePatternSet &patterns);
+// XeGPU to JointMatrix pattern
+void populateXeGPUToJointMatrixPatterns(mlir::SPIRVTypeConverter &typeConverter,
+                                        mlir::RewritePatternSet &patterns);
 } // namespace imex
 #endif // IMEX_CONVERSION_XEGPUTOSPIRV_H
diff --git a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index 589b3b033..17742bba5 100644
--- a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -156,35 +156,79 @@ void GPUXToSPIRVPass::runOnOperation() {
       eraseOp->erase();
     }
     target->addIllegalDialect();
-    typeConverter.addConversion([&](xegpu::NbarrierType type) -> ::mlir::Type {
-      auto i32Type = ::mlir::IntegerType::get(context, 32);
-      return mlir::VectorType::get(8, i32Type);
-    });
-    typeConverter.addConversion(
-        [&](xegpu::TensorDescType type) -> ::mlir::Type {
-          auto i32Type = ::mlir::IntegerType::get(context, 32);
-          return ::mlir::VectorType::get(8, i32Type);
-        });
-    typeConverter.addConversion([&](::mlir::VectorType type) -> ::mlir::Type {
-      unsigned rank = type.getRank();
-      auto elemType = type.getElementType();
-      if (rank < 1)
-        return type;
-      else {
-        // load2d/store2d is vnni format with 3 dims
-        if (rank == 3 && elemType.getIntOrFloatBitWidth() < 32) {
-          elemType = ::mlir::IntegerType::get(context, 32);
-          rank--;
+    // Only one of the following options should be enabled.
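    // (The three lowering paths selected by enable-vc-intrinsic,
    // enable-joint-matrix and enable-genisa-intrinsic are mutually
    // exclusive: the check below fails the pass when more than one is set,
    // and the emitOpError further down reports when none of them is set.)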
+ if ((this->enableVCIntrinsic && this->enableGenISAIntrinsic) || + (this->enableVCIntrinsic && this->enableJointMatrix) || + (this->enableGenISAIntrinsic && this->enableJointMatrix)) + return signalPassFailure(); + if (this->enableJointMatrix) { + // Tensor descriptor conversion pattern for SIMT JointMatrix + typeConverter.addConversion( + [&](xegpu::TensorDescType type) -> ::mlir::spirv::StructType { + llvm::SmallVector<::mlir::Type, 4> memberTypes; + auto i64Type = ::mlir::IntegerType::get(context, 64); + // Default storage class is spirv::StorageClass::CrossWorkgroup + auto spirvStorageClass = + ::mlir::spirv::StorageClass::CrossWorkgroup; + if (type.getMemoryScope() == xegpu::MemoryScope::SLM) + spirvStorageClass = ::mlir::spirv::StorageClass::Workgroup; + auto baseAddressType = ::mlir::spirv::PointerType::get( + type.getElementType(), spirvStorageClass); + memberTypes.push_back(baseAddressType); + memberTypes.push_back(i64Type); + + for (int i = 0; i < type.getRank(); i++) { + memberTypes.push_back(i64Type); + } + return ::mlir::spirv::StructType::get(memberTypes); + }); + typeConverter.addConversion([&](::mlir::VectorType type) -> ::mlir::Type { + unsigned rank = type.getRank(); + auto elemType = type.getElementType(); + if (rank < 1) + return type; + else { + unsigned sum = 1; + for (unsigned i = 0; i < rank; i++) { + sum *= type.getShape()[i]; + } + if (llvm::isa(elemType)) + elemType = ::mlir::IntegerType::get(context, 64); + return ::mlir::VectorType::get(sum, elemType); } - unsigned sum = 1; - for (unsigned i = 0; i < rank; i++) { - sum *= type.getShape()[i]; + }); + } else { + typeConverter.addConversion( + [&](xegpu::TensorDescType type) -> ::mlir::Type { + auto i32Type = ::mlir::IntegerType::get(context, 32); + return ::mlir::VectorType::get(8, i32Type); + }); + typeConverter.addConversion([&](::mlir::VectorType type) -> ::mlir::Type { + unsigned rank = type.getRank(); + auto elemType = type.getElementType(); + if (rank < 1) + return type; + else { + // load2d/store2d is vnni format with 3 dims + if (rank == 3 && elemType.getIntOrFloatBitWidth() < 32) { + elemType = ::mlir::IntegerType::get(context, 32); + rank--; + } + unsigned sum = 1; + for (unsigned i = 0; i < rank; i++) { + sum *= type.getShape()[i]; + } + if (llvm::isa(elemType)) + elemType = ::mlir::IntegerType::get(context, 64); + return ::mlir::VectorType::get(sum, elemType); } - if (llvm::isa(elemType)) - elemType = ::mlir::IntegerType::get(context, 64); - return ::mlir::VectorType::get(sum, elemType); - } - }); + }); + typeConverter.addConversion( + [&](xegpu::NbarrierType type) -> ::mlir::Type { + auto i32Type = ::mlir::IntegerType::get(context, 32); + return mlir::VectorType::get(8, i32Type); + }); + } //------- Upstream Conversion------------ mlir::populateGPUToSPIRVPatterns(typeConverter, patterns); @@ -200,9 +244,16 @@ void GPUXToSPIRVPass::runOnOperation() { mlir::populateMathToSPIRVPatterns(typeConverter, patterns); if (this->enableVCIntrinsic) imex::populateXeGPUToVCIntrinsicsPatterns(typeConverter, patterns); - else + else if (this->enableJointMatrix) + imex::populateXeGPUToJointMatrixPatterns(typeConverter, patterns); + else if (this->enableGenISAIntrinsic) imex::populateXeGPUToGenISAPatterns(typeConverter, patterns); - + else + module.emitOpError( + "'-imex-convert-gpu-to-spirv' pass must be run with one of the " + "following options to be 'true': " + "'enable-vc-intrinsic', 'enable-joint-matrix', " + "'enable-genisa-intrinsic'"); if (failed(applyFullConversion(gpuModule, *target, std::move(patterns)))) 
return signalPassFailure(); } diff --git a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp index c803c3b92..b4e86fcc8 100644 --- a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp +++ b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp @@ -1438,3 +1438,513 @@ void imex::populateXeGPUToGenISAPatterns(SPIRVTypeConverter &typeConverter, LoadStorePrefetchNdToGenISA>( typeConverter, patterns.getContext()); } + +namespace { +// PVC-specific subgroup size for JointMatrix +constexpr uint64_t jointMatrixSubGroupSize = 16; +// Calculate flattened offsets +// Calculate flattened offsets based on dims and offsets(indices) +Value linearizeOffset(OpBuilder builder, Location loc, + SmallVectorImpl &offsets, + SmallVectorImpl &dims) { + assert(offsets.size() == dims.size() && + "number of offsets & dimensions must be same"); + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = builder.getIntegerAttr(type, value); + return builder.create(loc, type, attr); + }; + + auto i64Type = builder.getI64Type(); + auto rank = dims.size(); + Value linearizedOffset = createIntConstant(i64Type, 0); + for (unsigned i = 0; i < rank; i++) { + Value perDimstrideMultiplier = createIntConstant(i64Type, 1); + for (unsigned j = i + 1; j < rank; j++) { + perDimstrideMultiplier = builder.create( + loc, i64Type, perDimstrideMultiplier, dims[j]); + } + perDimstrideMultiplier = builder.create( + loc, i64Type, perDimstrideMultiplier, offsets[i]); + + linearizedOffset = builder.create( + loc, i64Type, linearizedOffset, perDimstrideMultiplier); + } + return linearizedOffset; +} + +unsigned getElementPerWI(imex::xegpu::TensorDescType tDescType) { + imex::xegpu::SubGroupMapAttr sgMap; + auto encoding = tDescType.getEncoding(); + if (auto xeMapAttr = llvm::dyn_cast(encoding)) { + sgMap = xeMapAttr.getSg(); + } else { + sgMap = llvm::dyn_cast(encoding); + } + auto blockSize = tDescType.getShape(); + auto wiLayout = sgMap.getWiLayout(); + auto wiData = sgMap.getWiData(); + unsigned elemPerWI = 1; + for (size_t i = 0; i < wiData.size(); i++) { + if (wiData[i] != 1) + llvm_unreachable("wi_data must be 1 for all dimension for " + "JointMatrix lowering"); + elemPerWI *= (blockSize[i] / wiLayout[i]); + } + return elemPerWI; +} + +class CreateNdDescToJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(CreateNdDescOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + assert(op.getBoundaryCheck() == false && + "for xegpu to joint matrix lowering boundary_check attribute must " + "be false"); + auto loc = op.getLoc(); + auto tileType = op.getTensorDesc().getType(); + auto rank = tileType.getRank(); + + // Set the SPIR-V Struct to represent the Tensor Descriptor + // The create_nd_tdesc returns a spirv.struct + // The elements in the struct contains the following elements + // element 0 = base address pointer : spirv.ptr + // element 1 = 1D offset : i64 + // element 2 = X Dim Size : i64 + // element 3 = Y Dim Size : i64 + // [SPIR-V lowering uses 1D flattened addresses passed as kernel parameters] + SmallVector memberTypes; + auto i64Type = rewriter.getI64Type(); + // Default storage class is spirv::StorageClass::CrossWorkgroup + auto spirvStorageClass = spirv::StorageClass::CrossWorkgroup; + // For memref use memref spirv storage attribute if available + auto srcType = op.getSourceType(); + if (llvm::isa(srcType)) { + auto sc = dyn_cast_or_null( + 
llvm::cast(srcType).getMemorySpace()); + if (sc) + spirvStorageClass = sc.getValue(); + } + auto spirvBaseAddressType = + spirv::PointerType::get(op.getSourceElementType(), spirvStorageClass); + + memberTypes.push_back(spirvBaseAddressType); + memberTypes.push_back(i64Type); + // For nD descriptor, dimesion=rank, so we need dimSize for all the + // dimensions + for (int i = 0; i < rank; i++) { + memberTypes.push_back(i64Type); + } + + auto ndDescStruct = spirv::StructType::get(memberTypes); + + Value payLoad = rewriter.create(loc, ndDescStruct); + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = rewriter.getIntegerAttr(type, value); + return rewriter.create(loc, type, attr); + }; + + // Insert the base address to the ndDescStruct struct + Value genericBasePtr; + // If the base type is memref, add a bitcast op + // If the base type is not memref type, add a ConvertUToPtr op + if (llvm::isa(srcType)) { + genericBasePtr = rewriter.create( + loc, spirvBaseAddressType, adaptor.getSource()); + } else { + genericBasePtr = rewriter.create( + loc, spirvBaseAddressType, adaptor.getSource()); + } + + payLoad = rewriter.create( + loc, genericBasePtr, payLoad, llvm::ArrayRef(0)); + + // TODO: We should be able to use op.getOffsets() directly with index cast + // But we need support from XeGPU dialect definition to return i64_t + + auto createOffset = [&](unsigned idx) -> Value { + Value val; + if (ShapedType::isDynamic(op.getStaticOffsets()[idx])) { + val = op.getOffsets()[idx]; + // Cast index type to i64 + val = rewriter.create(loc, i64Type, val); + } else { + val = createIntConstant(i64Type, op.getStaticOffsets()[idx]); + } + return val; + }; + + // TODO: We should be able to use op.getShape() directly with index cast + // But we need support from XeGPU dialect definition to return i64_t + + auto createShape = [&](unsigned idx) -> Value { + Value val; + if (ShapedType::isDynamic(op.getStaticShape()[idx])) { + val = op.getShape()[idx]; + // Cast index type to i64 + val = rewriter.create(loc, i64Type, val); + } else { + val = createIntConstant(i64Type, op.getStaticShape()[idx]); + } + return val; + }; + + SmallVector nDOffsets; + SmallVector nDDims; + for (unsigned i = 0; i < rank; i++) { + nDOffsets.push_back(createOffset(i)); + } + + for (unsigned i = 0; i < rank; i++) { + nDDims.push_back(createShape(i)); + } + + // Calculate the 1-D offset, since the memrefs are flattened when + // passed to SPIR-V + Value linearizedOffset = linearizeOffset(rewriter, loc, nDOffsets, nDDims); + // Insert the flattened (1D) offset to the ndDescStruct struct + + payLoad = rewriter.create( + loc, linearizedOffset, payLoad, llvm::ArrayRef(1)); + for (int i = 0; i < rank; i++) { + payLoad = rewriter.create(loc, nDDims[i], + payLoad, (i + 2)); + } + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +class UpdateNDOffsetJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(UpdateNDOffsetOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto desc = adaptor.getTensorDesc(); + const int dimStartIdx = 2; + auto i64Type = rewriter.getI64Type(); + auto createIntConstant = [&](Type type, unsigned value) { + auto attr = rewriter.getIntegerAttr(type, value); + return rewriter.create(loc, type, attr); + }; + // Calculate the 1-D offset, since the memrefs are flattened when + // passed to SPIR-V + Value offset1D; + offset1D = createIntConstant(i64Type, 0); + 
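    // The loop below mirrors linearizeOffset() used by create_nd_tdesc and
    // computes a row-major linearized offset:
    //   offset1D = sum_i(offsets[i] * prod_{j > i} dimSize[j])
    // with the dim sizes read from struct members 2..rank+1. For example,
    // for the 2D descriptors over 1024x1024 memrefs in the SIMT GEMM test,
    // an update of [0, 16] adds 0 * 1024 + 16 = 16 elements to the 1D
    // offset held in struct member 1.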
auto offsets = adaptor.getOffsets(); + auto rank = op.getTensorDesc().getType().getRank(); + // number of offsets & tensorDescriptor rank must be same + assert(offsets.size() == (size_t)op.getTensorDesc().getType().getRank() && + "number of offsets & tensorDescriptor rank must be same"); + for (unsigned i = 0; i < rank; i++) { + Value perDimstrideMultiplier; + perDimstrideMultiplier = createIntConstant(i64Type, 1); + for (unsigned j = i + 1; j < rank; j++) { + Value dimSize = rewriter.create( + loc, desc, (j + dimStartIdx)); + perDimstrideMultiplier = rewriter.create( + loc, i64Type, perDimstrideMultiplier, dimSize); + } + // Cast index type to i64 + Value offsetVal = + rewriter.create(loc, i64Type, offsets[i]); + perDimstrideMultiplier = rewriter.create( + loc, i64Type, perDimstrideMultiplier, offsetVal); + + offset1D = rewriter.create(loc, i64Type, offset1D, + perDimstrideMultiplier); + } + + // Add the newOffset to previous offset + Value prev1DOffset = rewriter.create( + loc, desc, llvm::ArrayRef(1)); + offset1D = + rewriter.create(loc, i64Type, offset1D, prev1DOffset); + // Update the descriptor with the new offset + desc = rewriter.create(loc, offset1D, desc, + llvm::ArrayRef(1)); + rewriter.replaceOp(op, desc); + return success(); + } +}; + +class LoadNDJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(LoadNDOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (op.getTranspose()) + op.emitError("transpose is not currently supported for XeGPU to " + "JointMatrix lowering"); + auto loc = op.getLoc(); + auto tDesc = adaptor.getTensorDesc(); + auto tDescType = op.getTensorDesc().getType(); + int rank = tDescType.getRank(); + assert(rank == 2 && "only support 2d load for now"); + + // Get the base address + Value baseAddress = rewriter.create( + loc, tDesc, llvm::ArrayRef(0)); + // Get the offset + Value offset = rewriter.create( + loc, tDesc, llvm::ArrayRef(1)); + + SmallVector linearizedIndices; + // Get the load address + Value loadAddress = rewriter.create( + loc, baseAddress, offset, linearizedIndices); + + // Stride for jointMatrixLoad = Y Dim size + // TODO: what do we do for transpose case? + Value stride = rewriter.create( + loc, tDesc, llvm::ArrayRef(3)); + + // Figure out the Matrix Use type (MatrixA, MatrixB, Accumulator) + uint32_t matrixUse; + // Don't expect vnni axis to be set for the Accumulator + + if (auto vnniAxis = adaptor.getVnniAxis()) + // vnniAxis 0 -> MatrixB -> matrixUse = 1 + // vnniAxis 1 -> MatrixA -> matrixUse = 0 + matrixUse = (*vnniAxis + 1) % 2; + else + // vnniAxis empty -> Accumulator -> matrixUse = 2 + matrixUse = 2; + + // TODO: Need to discuss how to handle transpose, load then transpose or + // transposed load? 
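    // MatrixUse encoding used below: 0 = MatrixA, 1 = MatrixB,
    // 2 = Accumulator. vnni_axis = 1 marks the A operand, vnni_axis = 0 the
    // VNNI-packed B operand, and a missing vnni_axis the accumulator. With
    // the wi_layout = [1, 16] maps used in the SIMT GEMM test,
    // getElementPerWI() yields (8/1) * (16/16) = 8 values per work-item for
    // an 8x16 A tile and 16 for a 16x16 B tile.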
+ auto jointMatrixtype = spirv::JointMatrixINTELType::get( + tDescType.getElementType(), spirv::Scope::Subgroup, + tDescType.getDimSize(0), tDescType.getDimSize(1), + spirv::MatrixLayout::RowMajor, *spirv::symbolizeMatrixUse(matrixUse)); + + auto jointMatrixLoaded = rewriter.create( + loc, jointMatrixtype, loadAddress, stride, + ::mlir::spirv::MatrixLayout::RowMajor, ::mlir::spirv::Scope::Subgroup, + nullptr, nullptr); + + // TODO: Once architecture-spcific info are in place, add subgroup_size + // restriction verification + unsigned elemPerWI = getElementPerWI(tDescType); + auto elemType = tDescType.getElementType(); + auto perWIVectorType = VectorType::get(elemPerWI, elemType); + Value payLoad = rewriter.create(loc, perWIVectorType); + llvm::SmallVector extractedVal; + for (unsigned i = 0; i < elemPerWI; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedVal.push_back(rewriter.create( + loc, jointMatrixLoaded, idx)); + } + + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < elemPerWI; i++) { + auto idx = createConstantI32(loc, rewriter, i); + payLoad = rewriter.create( + loc, payLoad, extractedVal[i], idx); + } + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +class StoreNDJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(StoreNDOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto tDesc = adaptor.getTensorDesc(); + auto tDescType = op.getTensorDesc().getType(); + int rank = tDescType.getRank(); + assert(rank == 2 && "only support 2d load for now"); + + // Get the base address + Value baseAddress = rewriter.create( + loc, tDesc, llvm::ArrayRef(0)); + // Get the offset + Value offset = rewriter.create( + loc, tDesc, llvm::ArrayRef(1)); + + SmallVector linearizedIndices; + // Get the load address + Value loadAddress = rewriter.create( + loc, baseAddress, offset, linearizedIndices); + + // Stride for jointMatrixLoad = Y Dim size + // TODO: what do we do for transpose case? + Value stride = rewriter.create( + loc, tDesc, llvm::ArrayRef(3)); + + // For Store, we only allow Accumulator type matrix to store. + // TODO: We need to Add option on the xegpu.store_nd to support storing B + // matrix for that we need to add vnni_axis attribute to store_nd op as + // well. 
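  // The store path mirrors the load path in reverse: the per-work-item
  // result vector is copied element by element into an Accumulator joint
  // matrix, which is then written out through the SPIR-V joint-matrix store
  // op using the descriptor's Y-dim size (struct member 3) as the stride.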
+ uint32_t matrixUse = 2; + // Don't expect vnni axis to be set for the Accumulator + auto jointMatrixtype = spirv::JointMatrixINTELType::get( + tDescType.getElementType(), spirv::Scope::Subgroup, + tDescType.getDimSize(0), tDescType.getDimSize(1), + spirv::MatrixLayout::RowMajor, *spirv::symbolizeMatrixUse(matrixUse)); + Value matrix = rewriter.create(loc, jointMatrixtype); + + // TODO: Once architecture-spcific info are in place, add subgroup_size + // restriction verification + unsigned elemPerWI = getElementPerWI(tDescType); + // auto elemType = tDescType.getElementType(); + // Get the 2D vector + auto perWIVector = adaptor.getValue(); + llvm::SmallVector extractedVal; + for (unsigned i = 0; i < elemPerWI; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedVal.push_back(rewriter.create( + loc, perWIVector, idx)); + } + + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < elemPerWI; i++) { + auto idx = createConstantI32(loc, rewriter, i); + matrix = rewriter.create( + loc, matrix, extractedVal[i], idx); + } + auto payLoad = rewriter.create( + loc, loadAddress, matrix, stride, ::mlir::spirv::MatrixLayout::RowMajor, + ::mlir::spirv::Scope::Subgroup, nullptr, nullptr); + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +class DpasJointMatrix : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(DpasOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto vectorA = op.getLhs(); + auto vectorB = op.getRhs(); + auto vectorC = op.getAcc(); + { + OpBuilder::InsertionGuard guard(rewriter); + auto func = op->getParentOfType(); + rewriter.setInsertionPointAfter(func); + rewriter.create( + op.getLoc(), func, spirv::ExecutionMode::SubgroupSize, + int(jointMatrixSubGroupSize)); + } + // Matrix row = 1st dim of input vector + // Matrix colomn = 2nd dim of input vector * jointMatrixSubGroupSize + auto matrixAType = spirv::JointMatrixINTELType::get( + vectorA.getType().getElementType(), spirv::Scope::Subgroup, + vectorA.getType().getShape()[0], + vectorA.getType().getShape()[1] * jointMatrixSubGroupSize, + spirv::MatrixLayout::RowMajor, spirv::MatrixUse::MatrixA); + + // B matrix vector is passed VNNI-transformed, so row = dim0 *dim3 + auto matrixBType = spirv::JointMatrixINTELType::get( + vectorB.getType().getElementType(), spirv::Scope::Subgroup, + vectorB.getType().getShape()[0] * vectorB.getType().getShape()[2], + vectorB.getType().getShape()[1] * jointMatrixSubGroupSize, + spirv::MatrixLayout::RowMajor, spirv::MatrixUse::MatrixB); + + auto matrixCType = spirv::JointMatrixINTELType::get( + vectorC.getType().getElementType(), spirv::Scope::Subgroup, + vectorC.getType().getShape()[0], + vectorC.getType().getShape()[1] * jointMatrixSubGroupSize, + spirv::MatrixLayout::RowMajor, spirv::MatrixUse::Accumulator); + + Value matrixA = rewriter.create(loc, matrixAType); + Value matrixB = rewriter.create(loc, matrixBType); + Value matrixC = rewriter.create(loc, matrixCType); + // Create Matrices from the vectors + // Get the flattened vectors through the adaptor, since SPIRV only allows 1D + // vector + auto perWIVectorA = adaptor.getLhs(); + auto perWIVectorB = adaptor.getRhs(); + auto perWIVectorC = adaptor.getAcc(); + + llvm::SmallVector extractedValA; + auto perWIelemsA = + llvm::cast(perWIVectorA.getType()).getNumElements(); + for (unsigned i = 0; i < 
perWIelemsA; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedValA.push_back(rewriter.create( + loc, perWIVectorA, idx)); + } + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < perWIelemsA; i++) { + auto idx = createConstantI32(loc, rewriter, i); + matrixA = rewriter.create( + loc, matrixA, extractedValA[i], idx); + } + + llvm::SmallVector extractedValB; + auto perWIelemsB = + llvm::cast(perWIVectorB.getType()).getNumElements(); + for (unsigned i = 0; i < perWIelemsB; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedValB.push_back(rewriter.create( + loc, perWIVectorB, idx)); + } + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < perWIelemsB; i++) { + auto idx = createConstantI32(loc, rewriter, i); + matrixB = rewriter.create( + loc, matrixB, extractedValB[i], idx); + } + + llvm::SmallVector extractedValC; + auto perWIelemsC = + llvm::cast(perWIVectorC.getType()).getNumElements(); + for (unsigned i = 0; i < perWIelemsC; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedValC.push_back(rewriter.create( + loc, perWIVectorC, idx)); + } + // Putting all the extract and insert operations together, may make it + // easier for compiler (IGC) to reason about + for (unsigned i = 0; i < perWIelemsC; i++) { + auto idx = createConstantI32(loc, rewriter, i); + matrixC = rewriter.create( + loc, matrixC, extractedValC[i], idx); + } + + Value result = rewriter.create( + loc, matrixA, matrixB, matrixC, spirv::Scope::Subgroup); + + Value payLoad = + rewriter.create(loc, perWIVectorC.getType()); + llvm::SmallVector extractedValResult; + auto perWIelemsResult = perWIelemsC; + for (unsigned i = 0; i < perWIelemsResult; i++) { + auto idx = createConstantI32(loc, rewriter, i); + extractedValResult.push_back( + rewriter.create(loc, result, idx)); + } + for (unsigned i = 0; i < perWIelemsResult; i++) { + auto idx = createConstantI32(loc, rewriter, i); + payLoad = rewriter.create( + loc, payLoad, extractedValResult[i], idx); + } + rewriter.replaceOp(op, payLoad); + return success(); + } +}; + +} // namespace + +void imex::populateXeGPUToJointMatrixPatterns(SPIRVTypeConverter &typeConverter, + RewritePatternSet &patterns) { + patterns.add(typeConverter, patterns.getContext()); +} diff --git a/test/Conversion/GPUToSPIRV/loadstore.mlir b/test/Conversion/GPUToSPIRV/loadstore.mlir index e2cb41506..a6393fe8a 100644 --- a/test/Conversion/GPUToSPIRV/loadstore.mlir +++ b/test/Conversion/GPUToSPIRV/loadstore.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -allow-unregistered-dialect -split-input-file -imex-convert-gpu-to-spirv -verify-diagnostics %s -o - | FileCheck %s +// RUN: imex-opt -allow-unregistered-dialect -split-input-file -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' -verify-diagnostics %s -o - | FileCheck %s module attributes { gpu.container_module, diff --git a/test/Conversion/GPUToSPIRV/scf.mlir b/test/Conversion/GPUToSPIRV/scf.mlir index 2f89e5ce1..77d21877e 100644 --- a/test/Conversion/GPUToSPIRV/scf.mlir +++ b/test/Conversion/GPUToSPIRV/scf.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -allow-unregistered-dialect -imex-convert-gpu-to-spirv -verify-diagnostics %s -o - | FileCheck %s +// RUN: imex-opt -allow-unregistered-dialect -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' -verify-diagnostics %s -o - | FileCheck %s module attributes { gpu.container_module, diff --git 
a/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir b/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir index 5e0122015..a06272f10 100644 --- a/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=CHECK module @gemm attributes {gpu.container_module} { memref.global "private" @__constant_8x16xf32 : memref<8x16xf32> = dense<4.000000e-01> func.func @test(%arg0: memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { diff --git a/test/Conversion/XeGPUToSPIRV/barrier_basic.vc.mlir b/test/Conversion/XeGPUToSPIRV/barrier_basic.vc.mlir index 0d72915de..bd8103b18 100644 --- a/test/Conversion/XeGPUToSPIRV/barrier_basic.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/barrier_basic.vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir index 3c0ca946b..19113c35b 100644 --- a/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=false' %s | FileCheck %s +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-genisa-intrinsic=true' %s | FileCheck %s #sg_map_fp16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}> #sg_map_fp16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}> diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir index a4c12ec46..47570ea76 100644 --- a/test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir @@ -1,5 +1,5 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s -// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=LSC +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s +// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=LSC module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.vc.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.vc.mlir index 39569d345..0b84b4a3e 100644 --- a/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic_1d.vc.mlir @@ -1,5 +1,5 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-RAW -// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-LSC +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=CHECK-RAW +// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt 
-imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=CHECK-LSC module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.vc.mlir b/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.vc.mlir index ee7986859..859fba343 100644 --- a/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/gemm_basic_gather.vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=CHECK-RAW +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s --check-prefix=CHECK-RAW module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/lit.local.cfg b/test/Conversion/XeGPUToSPIRV/lit.local.cfg index d23a14a3b..fb1018074 100644 --- a/test/Conversion/XeGPUToSPIRV/lit.local.cfg +++ b/test/Conversion/XeGPUToSPIRV/lit.local.cfg @@ -1,4 +1,5 @@ local_excludes = [ 'gemm_basic.mlir' ] -config.excludes.update(local_excludes) +if(not config.imex_enable_excluded_tests): + config.excludes.update(local_excludes) diff --git a/test/Conversion/XeGPUToSPIRV/update_offset.vc.mlir b/test/Conversion/XeGPUToSPIRV/update_offset.vc.mlir index bb5905b13..2d46b0c90 100644 --- a/test/Conversion/XeGPUToSPIRV/update_offset.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/update_offset.vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> diff --git a/test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir b/test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir index 36d438780..b51354e01 100644 --- a/test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/xegpu-to-vc.mlir @@ -1,4 +1,4 @@ -// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s +// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=true' %s | FileCheck %s gpu.module @test attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { // CHECK: spirv.ConvertPtrToU diff --git a/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir b/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir new file mode 100644 index 000000000..1fb4c02b3 --- /dev/null +++ b/test/Integration/Dialect/XeGPU/gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir @@ -0,0 +1,139 @@ +// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm-joint-matrix.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm-joint-matrix.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: 
--entry-point-result=void \ +// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck + +// NOTE: This test case provides an end-to-end example of XeGPU SIMT mode ops to SPIR-V JointMatrix ops lowering + +#sg_map_fp16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}> +#sg_map_fp16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}> +#sg_map_fp16_c = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}> +module @gemm attributes {gpu.container_module} { + func.func @test(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %A_gpu = gpu.alloc host_shared () : memref<1024x1024xf16> + memref.copy %A, %A_gpu : memref<1024x1024xf16> to memref<1024x1024xf16> + %B_gpu = gpu.alloc host_shared () : memref<1024x1024xf16> + memref.copy %B, %B_gpu : memref<1024x1024xf16> to memref<1024x1024xf16> + %C_gpu = gpu.alloc host_shared () : memref<1024x1024xf32> + memref.copy %C, %C_gpu : memref<1024x1024xf32> to memref<1024x1024xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c128, %c64, %c1) threads in (%c1, %c16, %c1) args(%A_gpu : memref<1024x1024xf16>, %B_gpu : memref<1024x1024xf16>, %C_gpu : memref<1024x1024xf32>) + gpu.dealloc %A_gpu : memref<1024x1024xf16> + gpu.dealloc %B_gpu : memref<1024x1024xf16> + return %C_gpu : memref<1024x1024xf32> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf16>, %c: memref<1024x1024xf32>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c8 : index + %n = arith.muli %block_id_y, %c16 : index + + %1 = xegpu.create_nd_tdesc %a[%m, %c0] {boundary_check = false} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> + %2 = xegpu.create_nd_tdesc %b[%c0, %n] {boundary_check = false} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> + %tmpC = arith.constant dense<0.0> : vector<8xf32> + %3 = vector.shape_cast %tmpC : vector<8xf32> to vector<8x1xf32> + %tmp0, %tmp1, %result = scf.for %k= %c0 to %c1024 step %c16 iter_args(%subA = %1, %subB = %2, %subC = %3) + -> (!xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a>, !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b>, vector<8x1xf32>) { + %4 = xegpu.load_nd %subA {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> -> vector<8x1x1xf16> + %5 = xegpu.load_nd %subB {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x1x2xf16> + %6 = xegpu.dpas %4, %5, %subC : vector<8x1x1xf16>, vector<8x1x2xf16>, vector<8x1xf32> -> vector<8x1xf32> + %7 = xegpu.update_nd_offset %subA, [%c0, %c16] : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> + -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> + %8 = xegpu.update_nd_offset %subB, [%c16, %c0] : !xegpu.tensor_desc<16x16xf16, 
#sg_map_fp16_b> + -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> + scf.yield %7, %8, %6: !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a>, !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b>, vector<8x1xf32> + } + %9 = xegpu.create_nd_tdesc %c[%m, %n] {boundary_check = false} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> + xegpu.store_nd %result, %9 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> + gpu.return + } + } + func.func @main() attributes {llvm.emit_c_interface} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + %cf_0 = arith.constant 0.0 : f16 + %cf_1 = arith.constant 1.0 : f16 + %A = memref.alloc() : memref<1024x1024xf16> + %B = memref.alloc() : memref<1024x1024xf16> + %C = memref.alloc() : memref<1024x1024xf32> + %C_ref = memref.alloc() : memref<1024x1024xf32> + // intialize matrix A ; A[i, j] = j + scf.for %i = %c0 to %c1024 step %c1 { + scf.for %j = %c0 to %c1024 step %c1 { + %t = index.castu %j : index to i16 + %val = arith.uitofp %t : i16 to f16 + memref.store %val, %A[%i, %j] : memref<1024x1024xf16> + } + } + // make matrix B an identity matrix + scf.for %i = %c0 to %c1024 step %c1 { + scf.for %j = %c0 to %c1024 step %c1 { + %i_i32 = index.castu %i : index to i32 + %j_i32 = index.castu %j : index to i32 + %i_j_same = arith.cmpi eq, %i_i32, %j_i32 : i32 + + scf.if %i_j_same { + memref.store %cf_1, %B[%i, %j] : memref<1024x1024xf16> + } else { + memref.store %cf_0, %B[%i, %j] : memref<1024x1024xf16> + } + } + } + // intialize matrix C and C_ref ; C[i, j] = 0 + %c0_f32 = arith.constant 0.0 : f32 + scf.for %i = %c0 to %c1024 step %c1 { + scf.for %j = %c0 to %c1024 step %c1 { + memref.store %c0_f32, %C[%i, %j] : memref<1024x1024xf32> + memref.store %c0_f32, %C_ref[%i, %j] : memref<1024x1024xf32> + } + } + // compute C for reference + scf.for %i = %c0 to %c1024 step %c1 { + scf.for %j = %c0 to %c1024 step %c1 { + %c_curr = memref.load %C_ref[%i, %j] : memref<1024x1024xf32> + %c_val = scf.for %k = %c0 to %c1024 step %c1 iter_args(%c_partial = %c_curr) -> f32 { + %a_val = memref.load %A[%i, %k] : memref<1024x1024xf16> + %b_val = memref.load %B[%k, %j] : memref<1024x1024xf16> + %t = arith.mulf %a_val, %b_val : f16 + %t_cast = arith.extf %t : f16 to f32 + %c_sum = arith.addf %t_cast, %c_partial : f32 + scf.yield %c_sum : f32 + } + memref.store %c_val , %C_ref[%i, %j] : memref<1024x1024xf32> + } + } + %2 = call @test(%A, %B, %C) : (memref<1024x1024xf16>, memref<1024x1024xf16>, memref<1024x1024xf32>) -> memref<1024x1024xf32> + %cast_C = memref.cast %2 : memref<1024x1024xf32> to memref<*xf32> + %cast_C_ref = memref.cast %C_ref : memref<1024x1024xf32> to memref<*xf32> + // call @printMemrefF32(%cast_C) : (memref<*xf32>) -> () + // call @printMemrefF32(%cast_C_ref) : (memref<*xf32>) -> () + // CHECK: [ALLCLOSE: TRUE] + call @printAllcloseF32(%cast_C, %cast_C_ref) : (memref<*xf32>, memref<*xf32>) -> () + memref.dealloc %A : memref<1024x1024xf16> + memref.dealloc %B : memref<1024x1024xf16> + memref.dealloc %C : memref<1024x1024xf32> + memref.dealloc %C_ref : memref<1024x1024xf32> + return + } + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} + func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} +} diff --git a/test/Integration/Dialect/XeGPU/lit.local.cfg b/test/Integration/Dialect/XeGPU/lit.local.cfg index e084b0d12..ab78e6edf 
100644 --- a/test/Integration/Dialect/XeGPU/lit.local.cfg +++ b/test/Integration/Dialect/XeGPU/lit.local.cfg @@ -1,9 +1,11 @@ local_excludes = [ 'gemm_1024x1024xf16.mlir', 'gemm_1024x1024xf16.using.updateoffset.mlir', + 'gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir', 'gemm_1024x1016x1016_f16_f16_f32.mlir', 'load2d_dpas_store2d.mlir', 'load2d-padding-f32.mlir', 'load2d-padding.mlir' ] -config.excludes.update(local_excludes) +if(not config.imex_enable_excluded_tests): + config.excludes.update(local_excludes) diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-llvm-joint-matrix.pp b/test/Integration/Dialect/XeGPU/xegpu-to-llvm-joint-matrix.pp new file mode 100644 index 000000000..e4b615146 --- /dev/null +++ b/test/Integration/Dialect/XeGPU/xegpu-to-llvm-joint-matrix.pp @@ -0,0 +1,25 @@ +// linalg dialect to gpu dialect lowering pipeline +// Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. +builtin.module( + imex-convert-gpu-to-spirv{enable-joint-matrix=true} + canonicalize + spirv.module(spirv-lower-abi-attrs + spirv-update-vce) + func.func(llvm-request-c-wrappers) + serialize-spirv + convert-vector-to-scf + convert-gpu-to-gpux + convert-scf-to-cf + convert-cf-to-llvm + convert-vector-to-llvm + convert-index-to-llvm + convert-arith-to-llvm + convert-func-to-llvm + convert-math-to-llvm + convert-gpux-to-llvm + convert-index-to-llvm + expand-strided-metadata + lower-affine + finalize-memref-to-llvm + reconcile-unrealized-casts) +// End diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp b/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp index bc7826608..72f8264b2 100644 --- a/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp +++ b/test/Integration/Dialect/XeGPU/xegpu-to-llvm.pp @@ -1,7 +1,7 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. builtin.module( - imex-convert-gpu-to-spirv + imex-convert-gpu-to-spirv{enable-vc-intrinsic=true} spirv.module(spirv-lower-abi-attrs spirv-update-vce) func.func(llvm-request-c-wrappers) diff --git a/test/SPIRV/IntelVectorExtension/lit.local.cfg b/test/SPIRV/IntelVectorExtension/lit.local.cfg index 00604223a..3cc7d0832 100644 --- a/test/SPIRV/IntelVectorExtension/lit.local.cfg +++ b/test/SPIRV/IntelVectorExtension/lit.local.cfg @@ -6,4 +6,5 @@ local_excludes = ['DPAS_Dynamic_Size_BF16.mlir', 'Load_2d_raw_send.mlir', 'Store2d_raw_send.mlir', 'GEMM_4kx4kx4k_BF16.mlir'] -config.excludes.update(local_excludes) +if(not config.imex_enable_excluded_tests): + config.excludes.update(local_excludes)
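Note on the shape bookkeeping behind DpasJointMatrix and the new SIMT GEMM test: every per-work-item vector is widened along its second dimension by the subgroup size (jointMatrixSubGroupSize = 16 on PVC), and the VNNI-packed B operand folds its innermost dimension back into rows. The minimal stand-alone C++ sketch below (illustrative only, not part of the patch; all names in it are local to the example) reproduces that arithmetic for the 8x16x16 DPAS tile and the (128, 64) block grid used in gemm_SIMT_1024x1024x1024xf16_f16_f32.mlir.

// Illustrative shape check for the JointMatrix lowering of xegpu.dpas
// (stand-alone sketch, not part of the patch).
#include <cstdio>

int main() {
  const int subGroupSize = 16; // matches jointMatrixSubGroupSize for PVC

  // Per-work-item vector shapes from the SIMT GEMM test:
  //   A: vector<8x1x1xf16>, B: vector<8x1x2xf16> (VNNI), C: vector<8x1xf32>
  int aRows = 8,     aCols = 1 * subGroupSize; // 8x16  MatrixA
  int bRows = 8 * 2, bCols = 1 * subGroupSize; // 16x16 MatrixB (rows = dim0 * dim2)
  int cRows = 8,     cCols = 1 * subGroupSize; // 8x16  Accumulator
  std::printf("A %dx%d  B %dx%d  C %dx%d\n", aRows, aCols, bRows, bCols,
              cRows, cCols);

  // Launch geometry from the test: a (128, 64) block grid with one 8x16
  // C tile per subgroup covers the full 1024x1024 output.
  std::printf("C coverage: %d x %d\n", 128 * cRows, 64 * cCols);
  return 0;
}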