Support index element type in VectorLinearize pass
Clone the upstream patterns for the constant op and for vectorizable ops, and
remove the isLessThanTargetBitWidth check so that vectors with an index
element type are also linearized.

Remove the IndexCastOp pattern from the XeGPUToVC pass.
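
For context, a minimal before/after sketch of the IR this change enables, derived from the test_linearize_index test added below; the function and value names are illustrative and not part of the commit:

```mlir
// Input: arith ops on 2-D index-typed vectors.
func.func @example(%arg0: vector<2x2xindex>) -> vector<2x2xindex> {
  %cst = arith.constant dense<[[0, 1], [2, 3]]> : vector<2x2xindex>
  %0 = arith.addi %arg0, %cst : vector<2x2xindex>
  return %0 : vector<2x2xindex>
}

// After VectorLinearize (abridged): the constant and the addi operate on
// vector<4xindex>, with vector.shape_cast ops at the 2-D/1-D boundaries.
//   %cst = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
//   %1 = vector.shape_cast %arg0 : vector<2x2xindex> to vector<4xindex>
//   %2 = arith.addi %1, %cst : vector<4xindex>
//   %3 = vector.shape_cast %2 : vector<4xindex> to vector<2x2xindex>
//   return %3 : vector<2x2xindex>
```

The generic vectorizable-op pattern also covers arith.index_cast (as the new test exercises), which appears to be why the dedicated IndexCastPattern in XeGPUToVC is no longer needed.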
ayzhuang committed Nov 21, 2024
1 parent 49ca730 commit 0d06825
Showing 3 changed files with 174 additions and 68 deletions.
27 changes: 0 additions & 27 deletions lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -661,22 +661,6 @@ class VectorShapeCastPattern : public OpConversionPattern<ShapeCastOp> {
}
};

template <typename OpTy>
class IndexCastPattern : public OpConversionPattern<OpTy> {
public:
using OpConversionPattern<OpTy>::OpConversionPattern;
LogicalResult
matchAndRewrite(OpTy indexCastOp, typename OpTy::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto *converter = OpConversionPattern<OpTy>::getTypeConverter();
Type dstType = converter->convertType(indexCastOp.getType());
if (!dstType)
return failure();
rewriter.replaceOpWithNewOp<OpTy>(indexCastOp, dstType, adaptor.getIn());
return success();
}
};

class SCFForPattern : public OpConversionPattern<ForOp> {
public:
using OpConversionPattern<ForOp>::OpConversionPattern;
@@ -823,14 +807,6 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
target.addDynamicallyLegalDialect<scf::SCFDialect>(
[&](Operation *op) { return isLegalXeGPUSCFOp(op, typeConverter); });

target.addDynamicallyLegalOp<arith::IndexCastOp, arith::IndexCastUIOp>(
[&](Operation *op) {
if (auto vecTy = dyn_cast<VectorType>(op->getResult(0).getType())) {
return typeConverter.isLegal(vecTy);
}
return true;
});

target.addIllegalOp<ShapeCastOp>();

// TODO: can we change it to addDynamicLegalOp?
@@ -883,9 +859,6 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
patterns.add<VectorShapeCastPattern, SCFForPattern>(typeConverter,
patterns.getContext());

patterns.add<IndexCastPattern<arith::IndexCastOp>,
IndexCastPattern<arith::IndexCastUIOp>>(typeConverter,
patterns.getContext());
// Ops to llvm.genx only Patterns
patterns.add<NbarrierWaitPattern, CompilerHintPattern, DpasPattern,
NbarrierArrivePattern>(patterns.getContext());
123 changes: 82 additions & 41 deletions lib/Transforms/VectorLinearize.cpp
@@ -34,33 +34,93 @@ namespace imex {

namespace {

// rewrite arith.constant op in form of vector<1xmxindex> into 1D form
// (vector<mxindex>)
struct ArithConstantOpConversion final
// Cloned from upstream with isLessThanTargetBitWidth check removed.
struct ConstantOpConversion final
: public mlir::OpConversionPattern<mlir::arith::ConstantOp> {
using mlir::OpConversionPattern<mlir::arith::ConstantOp>::OpConversionPattern;
using OpConversionPattern::OpConversionPattern;
mlir::LogicalResult
matchAndRewrite(mlir::arith::ConstantOp constOp, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
auto value = llvm::dyn_cast<mlir::DenseElementsAttr>(constOp.getValue());
if (!value || value.getType().getRank() != 2)
return mlir::failure();
auto type = value.getType();
auto shape = type.getShape();
auto elemTy = type.getElementType();
if (shape[0] != 1 || !elemTy.isIndex())
auto resType =
getTypeConverter()->convertType<mlir::VectorType>(constOp.getType());

if (resType.isScalable() &&
!mlir::isa<mlir::SplatElementsAttr>(constOp.getValue()))
return rewriter.notifyMatchFailure(
constOp,
"Cannot linearize a constant scalable vector that's not a splat");

if (!resType)
return rewriter.notifyMatchFailure(constOp, "can't convert return type");
auto dstElementsAttr =
mlir::dyn_cast<mlir::DenseElementsAttr>(constOp.getValue());
if (!dstElementsAttr)
return rewriter.notifyMatchFailure(constOp, "unsupported attr type");

dstElementsAttr = dstElementsAttr.reshape(resType);
rewriter.replaceOpWithNewOp<mlir::arith::ConstantOp>(constOp, resType,
dstElementsAttr);
return mlir::success();
}
};

// Cloned from upstream with isLessThanTargetBitWidth check removed.
struct VectorizableOpConversion final
: public mlir::OpTraitConversionPattern<mlir::OpTrait::Vectorizable> {
using OpTraitConversionPattern::OpTraitConversionPattern;
mlir::LogicalResult
matchAndRewrite(mlir::Operation *op, llvm::ArrayRef<mlir::Value> operands,
mlir::ConversionPatternRewriter &rewriter) const override {
mlir::FailureOr<mlir::Operation *> newOp =
convertOpResultTypes(op, operands, *getTypeConverter(), rewriter);
if (failed(newOp))
return mlir::failure();
auto newTy = mlir::VectorType::get({shape[1]}, elemTy);
value = value.reshape(newTy);
auto newOp =
rewriter.create<mlir::arith::ConstantOp>(constOp.getLoc(), value);
auto castOp = rewriter.create<mlir::vector::ShapeCastOp>(constOp.getLoc(),
type, newOp);
rewriter.replaceOp(constOp, castOp);

rewriter.replaceOp(op, (*newOp)->getResults());
return mlir::success();
}
};

// Cloned from upstream with isLessThanTargetBitWidth check removed.
static void populateVectorLinearizeTypeConversionsAndLegality(
mlir::TypeConverter &typeConverter, mlir::RewritePatternSet &patterns,
mlir::ConversionTarget &target) {

typeConverter.addConversion(
[](mlir::VectorType type) -> std::optional<mlir::Type> {
if (!mlir::vector::isLinearizableVector(type))
return type;

return mlir::VectorType::get(type.getNumElements(),
type.getElementType(), type.isScalable());
});

auto materializeCast = [](mlir::OpBuilder &builder, mlir::Type type,
mlir::ValueRange inputs,
mlir::Location loc) -> mlir::Value {
if (inputs.size() != 1 ||
!mlir::isa<mlir::VectorType>(inputs.front().getType()) ||
!mlir::isa<mlir::VectorType>(type))
return nullptr;

return builder.create<mlir::vector::ShapeCastOp>(loc, type, inputs.front());
};
typeConverter.addArgumentMaterialization(materializeCast);
typeConverter.addSourceMaterialization(materializeCast);
typeConverter.addTargetMaterialization(materializeCast);
target.markUnknownOpDynamicallyLegal(
[=](mlir::Operation *op) -> std::optional<bool> {
if ((mlir::isa<mlir::arith::ConstantOp>(op) ||
op->hasTrait<mlir::OpTrait::Vectorizable>())) {
return typeConverter.isLegal(op);
}
return std::nullopt;
});

patterns.add<ConstantOpConversion, VectorizableOpConversion>(
typeConverter, patterns.getContext());
}

struct VectorLoadOpConversion final
: public mlir::OpConversionPattern<mlir::vector::LoadOp> {
using mlir::OpConversionPattern<mlir::vector::LoadOp>::OpConversionPattern;
@@ -513,38 +573,19 @@ struct VectorLinearizePass final
return (op && op.getAggregate().getType().getRank() == 1);
});

// borrowed from upstream with hacking for index type. Currently
// we only target vector<1xmxindex> to vector<mxindex> conversion. It is
// unclear whether others are valid or not; thus they are left untouched.
target.addDynamicallyLegalOp<mlir::arith::ConstantOp>(
[&](mlir::arith::ConstantOp op) -> bool {
auto vecTy = mlir::dyn_cast<mlir::VectorType>(op.getType());
if (!vecTy || vecTy.getRank() == 0)
return true;

auto elemTy = vecTy.getElementType();
if (elemTy.isIndex()) {
if (vecTy.getRank() == 2 && vecTy.getShape()[0] == 1)
return false;
return true;
}
return !mlir::vector::isLinearizableVector(vecTy);
});

patterns.add<VectorExtractStridedSliceConversion, VectorShffleOpConversion,
VectorExtractOpConversion, VectorInsertOpConversion,
VectorSplatOpConversion, VectorLoadOpConversion,
VectorStoreOpConversion, VectorCreateMaskOpConversion,
ArithConstantOpConversion>(typeConverter, context);
VectorStoreOpConversion, VectorCreateMaskOpConversion>(
typeConverter, context);

// Shuffle16x16 will fallback to Shuffle1D for non 16x16 sizes.
mlir::vector::populateVectorTransposeLoweringPatterns(
patterns,
mlir::vector::VectorTransformsOptions().setVectorTransposeLowering(
mlir::vector::VectorTransposeLowering::Shuffle16x16));
unsigned targetVectBitWidth = std::numeric_limits<unsigned>::max();
mlir::vector::populateVectorLinearizeTypeConversionsAndLegality(
typeConverter, patterns, target, targetVectBitWidth);
populateVectorLinearizeTypeConversionsAndLegality(typeConverter, patterns,
target);
if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
std::move(patterns))))
return signalPassFailure();
92 changes: 92 additions & 0 deletions test/Transforms/vector-linearize.mlir
@@ -284,3 +284,95 @@ func.func @test_vector_store_load_4x4(%buffer: memref<4x4xf32>) {
vector.store %0, %buffer[%c0, %c0] : memref<4x4xf32>, vector<4x4xf32>
return
}

// -----
// CHECK-LABEL: @test_linearize_index
// CHECK-SAME: (%[[ARG0:.*]]: vector<2x2xindex>, %[[ARG1:.*]]: vector<2x2xi32>) -> vector<2x2xindex> {
// CHECK: %[[T0:.*]] = vector.shape_cast %[[ARG1]] : vector<2x2xi32> to vector<4xi32>
// CHECK: %[[T1:.*]] = vector.shape_cast %[[ARG0]] : vector<2x2xindex> to vector<4xindex>
// CHECK: %[[CST:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK: %[[T2:.*]] = arith.addi %[[T1]], %[[CST]] : vector<4xindex>
// CHECK: %[[T3:.*]] = arith.index_cast %[[T2]] : vector<4xindex> to vector<4xi32>
// CHECK: %[[T4:.*]] = arith.muli %[[T3]], %[[T0]] : vector<4xi32>
// CHECK: %[[T5:.*]] = arith.index_cast %[[T4]] : vector<4xi32> to vector<4xindex>
// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<4xindex> to vector<2x2xindex>
// CHECK: return %[[T6]] : vector<2x2xindex>
func.func @test_linearize_index(%arg0: vector<2x2xindex>, %arg1: vector<2x2xi32>) -> vector<2x2xindex> {
%0 = arith.constant dense<[[0, 1], [2, 3]]> : vector<2x2xindex>
// Arith and math ops are handled in generic way, check some of them
%1 = arith.addi %arg0, %0 : vector<2x2xindex>
%2 = arith.index_cast %1 : vector<2x2xindex> to vector<2x2xi32>
%3 = arith.muli %2, %arg1 : vector<2x2xi32>
%4 = arith.index_cast %3 : vector<2x2xi32> to vector<2x2xindex>
return %4 : vector<2x2xindex>
}

// -----
// CHECK-LABEL: @add_kernel_f32
// CHECK: %[[CST0:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
// CHECK: %[[CST1:.*]] = arith.constant dense<[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]> : vector<16xindex>
// CHECK: %[[T0:.*]] = vector.splat %{{.*}} : vector<16xindex>
// CHECK: %[[T1:.*]] = arith.addi %[[T0]], %[[CST0]] : vector<16xindex>
// CHECK: %[[T2:.*]] = arith.addi %[[T0]], %[[CST1]] : vector<16xindex>
// CHECK: %[[T3:.*]] = arith.index_cast %[[T1]] : vector<16xindex> to vector<16xi32>
// CHECK: %[[T4:.*]] = arith.index_cast %[[T2]] : vector<16xindex> to vector<16xi32>
// CHECK: %[[T5:.*]] = vector.splat %{{.*}} : vector<16xi32>
// CHECK: %[[T6:.*]] = arith.addi %[[T5]], %[[T3]] : vector<16xi32>
// CHECK: %[[T7:.*]] = arith.addi %[[T5]], %[[T4]] : vector<16xi32>
// CHECK: %[[T8:.*]] = arith.index_cast %[[T6]] : vector<16xi32> to vector<16xindex>
// CHECK: %[[T9:.*]] = arith.index_cast %[[T7]] : vector<16xi32> to vector<16xindex>
gpu.module @add_kernel_f32 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @add_kernel_f32(%arg0: memref<*xf32>, %arg1: memref<*xf32>, %arg2: memref<*xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 32, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%cst = arith.constant dense<true> : vector<16xi1>
%c32 = arith.constant 32 : index
%c1024_i32 = arith.constant 1024 : i32
%cst_0 = arith.constant dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
%cst_1 = arith.constant dense<[[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]]> : vector<1x16xindex>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%block_dim_y = gpu.block_dim y
%0 = arith.muli %thread_id_x, %block_dim_y : index
%1 = arith.addi %0, %thread_id_y : index
%cast = memref.cast %arg0 : memref<*xf32> to memref<?xf32>
%cast_2 = memref.cast %arg1 : memref<*xf32> to memref<?xf32>
%cast_3 = memref.cast %arg2 : memref<*xf32> to memref<?xf32>
%2 = arith.remsi %1, %c32 : index
%3 = arith.muli %2, %c32 : index
%4 = vector.splat %3 : vector<1x16xindex>
%5 = arith.addi %4, %cst_0 : vector<1x16xindex>
%6 = arith.addi %4, %cst_1 : vector<1x16xindex>
%7 = arith.index_cast %5 : vector<1x16xindex> to vector<1x16xi32>
%8 = arith.index_cast %6 : vector<1x16xindex> to vector<1x16xi32>
%block_id_x = gpu.block_id x
%9 = arith.index_cast %block_id_x : index to i32
%10 = arith.muli %9, %c1024_i32 : i32
%11 = vector.splat %10 : vector<1x16xi32>
%12 = arith.addi %11, %7 : vector<1x16xi32>
%13 = arith.addi %11, %8 : vector<1x16xi32>
%14 = arith.index_cast %12 : vector<1x16xi32> to vector<1x16xindex>
%15 = arith.index_cast %13 : vector<1x16xi32> to vector<1x16xindex>
%16 = vector.shape_cast %14 : vector<1x16xindex> to vector<16xindex>
%17 = xegpu.create_tdesc %cast, %16 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>
%18 = vector.shape_cast %15 : vector<1x16xindex> to vector<16xindex>
%19 = xegpu.create_tdesc %cast, %18 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>
%20 = xegpu.load %17, %cst <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>, vector<16xi1> -> vector<16xf32>
%21 = vector.shape_cast %20 : vector<16xf32> to vector<1x16xf32>
%22 = xegpu.load %19, %cst <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>, vector<16xi1> -> vector<16xf32>
%23 = vector.shape_cast %22 : vector<16xf32> to vector<1x16xf32>
%24 = xegpu.create_tdesc %cast_2, %16 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>
%25 = xegpu.create_tdesc %cast_2, %18 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>
%26 = xegpu.load %24, %cst <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>, vector<16xi1> -> vector<16xf32>
%27 = vector.shape_cast %26 : vector<16xf32> to vector<1x16xf32>
%28 = xegpu.load %25, %cst <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>, vector<16xi1> -> vector<16xf32>
%29 = vector.shape_cast %28 : vector<16xf32> to vector<1x16xf32>
%30 = arith.addf %21, %27 : vector<1x16xf32>
%31 = arith.addf %23, %29 : vector<1x16xf32>
%32 = xegpu.create_tdesc %cast_3, %16 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>
%33 = xegpu.create_tdesc %cast_3, %18 : memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>
%34 = vector.shape_cast %30 : vector<1x16xf32> to vector<16xf32>
xegpu.store %34, %32, %cst <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>, vector<16xi1>
%35 = vector.shape_cast %31 : vector<1x16xf32> to vector<16xf32>
xegpu.store %35, %33, %cst <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 1 : i64>>, vector<16xi1>
gpu.return
}
}
