[LV][EVL] Support call instruction with EVL-vectorization
LiqinWeng committed Oct 17, 2024
1 parent 2611132 commit 35d8632
Showing 11 changed files with 135 additions and 75 deletions.
5 changes: 5 additions & 0 deletions llvm/include/llvm/Analysis/VectorUtils.h
@@ -160,6 +160,11 @@ bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx);
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
const TargetLibraryInfo *TLI);

+/// Returns the VP intrinsic ID for a call.
+/// For the given call instruction, finds the matching VP intrinsic and
+/// returns its intrinsic ID; if none is found, returns
+/// Intrinsic::not_intrinsic.
+Intrinsic::ID getVPIntrinsicIDForCall(const CallInst *CI);

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
4 changes: 2 additions & 2 deletions llvm/include/llvm/IR/VectorBuilder.h
@@ -99,11 +99,11 @@ class VectorBuilder {
const Twine &Name = Twine());

  /// Emit a VP reduction intrinsic call for recurrence kind.
-  /// \param RdxID The intrinsic ID of llvm.vector.reduce.*
+  /// \param ID The intrinsic ID of the call intrinsic
  /// \param ValTy The type of the operand on which the reduction operation is
  /// performed.
  /// \param VecOpArray The operand list.
-  Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy,
+  Value *createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy,
                               ArrayRef<Value *> VecOpArray,
                               const Twine &Name = Twine());
};
9 changes: 9 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
@@ -169,6 +169,15 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
return Intrinsic::not_intrinsic;
}

+Intrinsic::ID llvm::getVPIntrinsicIDForCall(const CallInst *CI) {
+  const Function *F = CI->getCalledFunction();
+  if (!F)
+    return Intrinsic::not_intrinsic;
+
+  if (F->isIntrinsic())
+    return VPIntrinsic::getForIntrinsic(F->getIntrinsicID());
+  return Intrinsic::not_intrinsic;
+}

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
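A minimal usage sketch of the new helper; the wrapper name hasVPCounterpart is illustrative and not part of this patch:

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// For a call such as %r = call i32 @llvm.smax.i32(i32 %a, i32 %b), the helper
// returns Intrinsic::vp_smax; for non-intrinsic calls, or intrinsics without
// a VP counterpart, it returns Intrinsic::not_intrinsic.
static bool hasVPCounterpart(const CallInst *CI) {
  return getVPIntrinsicIDForCall(CI) != Intrinsic::not_intrinsic;
}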
9 changes: 4 additions & 5 deletions llvm/lib/IR/VectorBuilder.cpp
@@ -60,13 +60,12 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
}

-Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID,
-                                            Type *ValTy,
+Value *VectorBuilder::createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy,
                                             ArrayRef<Value *> InstOpArray,
                                             const Twine &Name) {
-  auto VPID = VPIntrinsic::getForIntrinsic(RdxID);
-  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
-         "No VPIntrinsic for this reduction");
+  auto VPID = VPIntrinsic::getForIntrinsic(ID);
+  assert(VPIntrinsic::isVPIntrinsic(VPID) &&
+         "No VPIntrinsic for this Intrinsic");
return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
}

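A short sketch of the renamed API in use, assuming caller-provided values A, B, EVL and types VF, VecTy:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/VectorBuilder.h"
using namespace llvm;

// Emits llvm.vp.smax over two vector operands. The mask is all-true, so the
// number of active lanes is governed solely by the explicit vector length.
static Value *emitVPSMax(IRBuilderBase &Builder, Value *A, Value *B,
                         Value *EVL, ElementCount VF, VectorType *VecTy) {
  VectorBuilder VBuilder(Builder);
  Value *Mask = Builder.CreateVectorSplat(VF, Builder.getTrue());
  VBuilder.setMask(Mask).setEVL(EVL);
  // Maps Intrinsic::smax to Intrinsic::vp_smax; asserts if no VP form exists.
  return VBuilder.createSimpleIntrinsic(Intrinsic::smax, VecTy, {A, B},
                                        "vp.call");
}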
8 changes: 8 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1073,6 +1073,14 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
break;
}
+  // TODO: Compute an accurate cost for these VP intrinsics in a follow-up
+  // patch (e.g. based on LT.first); return a placeholder cost of 1 for now.
+  case Intrinsic::vp_smax:
+  case Intrinsic::vp_smin:
+  case Intrinsic::vp_umax:
+  case Intrinsic::vp_umin:
+    return 1;
// vp int cast ops.
case Intrinsic::vp_trunc:
case Intrinsic::vp_zext:
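A sketch of how this placeholder cost is observed from the cost model; TTI and VecTy are assumed to come from the surrounding pass, and the vp.* argument type list is abbreviated for illustration:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// With the change above, RISC-V reports a placeholder cost of 1 for
// llvm.vp.smax instead of falling through to the default handling.
static InstructionCost vpSMaxCost(const TargetTransformInfo &TTI,
                                  VectorType *VecTy) {
  IntrinsicCostAttributes ICA(Intrinsic::vp_smax, VecTy, {VecTy, VecTy});
  return TTI.getIntrinsicInstrCost(ICA,
                                   TargetTransformInfo::TCK_RecipThroughput);
}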
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1300,7 +1300,7 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src,
Type *SrcEltTy = SrcTy->getElementType();
Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
Value *Ops[] = {Iden, Src};
-  return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+  return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops);
}

Value *llvm::createReduction(IRBuilderBase &B,
@@ -1343,7 +1343,7 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd);
auto *SrcTy = cast<VectorType>(Src->getType());
Value *Ops[] = {Start, Src};
-  return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+  return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops);
}

void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
1 change: 0 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8351,7 +8351,6 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
return nullptr;

SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));

// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
ID && LoopVectorizationPlanner::getDecisionAndClampRange(
14 changes: 14 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1708,6 +1708,20 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
/// Returns true if the intrinsic may write to memory.
bool mayWriteToMemory() const { return MayWriteToMemory; }

+  operand_range arg_operands() {
+    unsigned ArgNum = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)
+                          ? getNumOperands() - 1
+                          : getNumOperands();
+    return make_range(op_begin(), op_begin() + ArgNum);
+  }
+
+  const_operand_range arg_operands() const {
+    unsigned ArgNum = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)
+                          ? getNumOperands() - 1
+                          : getNumOperands();
+    return make_range(op_begin(), op_begin() + ArgNum);
+  }

/// Returns true if the intrinsic may have side-effects.
bool mayHaveSideEffects() const { return MayHaveSideEffects; }

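A quick illustration of the contract (the recipe shape and function name are hypothetical): for a recipe widened to llvm.vp.smax with operands {A, B, EVL}, the range visits only A and B.

#include "VPlan.h" // in-tree header under llvm/lib/Transforms/Vectorize
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Counts the call arguments of a widened intrinsic recipe; the trailing EVL
// operand of a VP intrinsic is excluded, since it is not a call argument.
static unsigned numCallArgs(VPWidenIntrinsicRecipe &R) {
  return llvm::size(R.arg_operands());
}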
33 changes: 24 additions & 9 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -970,7 +970,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
SmallVector<Value *, 4> Args;
-  for (const auto &I : enumerate(operands())) {
+  for (const auto &I : enumerate(arg_operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
@@ -983,18 +983,33 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
Args.push_back(Arg);
}

-  // Use vector version of the intrinsic.
-  Module *M = State.Builder.GetInsertBlock()->getModule();
-  Function *VectorF =
-      Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
-  assert(VectorF && "Can't retrieve vector intrinsic.");
-
+  CallInst *V = nullptr;
  auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

-  CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
+  if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
+    // Emit the VP form of the intrinsic: an all-true mask, with the trailing
+    // EVL operand governing the active lanes.
+    IRBuilderBase &BuilderIR = State.Builder;
+    VectorBuilder VBuilder(BuilderIR);
+    Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
+    VBuilder.setMask(Mask).setEVL(
+        State.get(getOperand(getNumOperands() - 1), /*NeedsScalar=*/true));
+    auto *TyReturn = VectorType::get(getResultType(), State.VF);
+    Value *VPInst = VBuilder.createSimpleIntrinsic(VectorIntrinsicID, TyReturn,
+                                                   Args, "vp.call");
+    if (VPInst)
+      V = cast<CallInst>(VPInst);
+  } else {
+    // Use the vector version of the intrinsic.
+    Module *M = State.Builder.GetInsertBlock()->getModule();
+    Function *VectorF =
+        Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
+    assert(VectorF && "Can't retrieve vector intrinsic.");
+    V = State.Builder.CreateCall(VectorF, Args, OpBundles);
+  }

setFlags(V);

@@ -1013,7 +1028,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
// clear Arguments.
// TODO: Rework TTI interface to be independent of concrete IR values.
SmallVector<const Value *> Arguments;
-  for (const auto &[Idx, Op] : enumerate(operands())) {
+  for (const auto &[Idx, Op] : enumerate(arg_operands())) {
auto *V = Op->getUnderlyingValue();
if (!V) {
if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
11 changes: 11 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1381,6 +1381,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
return nullptr;
return new VPWidenEVLRecipe(*W, EVL);
})
+          .Case<VPWidenIntrinsicRecipe>(
+              [&](VPWidenIntrinsicRecipe *CInst) -> VPRecipeBase * {
+                auto *CI = cast<CallInst>(CInst->getUnderlyingInstr());
+                SmallVector<VPValue *> Ops(CInst->operands());
+                Ops.push_back(&EVL);
+                Intrinsic::ID VPID = getVPIntrinsicIDForCall(CI);
+                if (VPID == Intrinsic::not_intrinsic)
+                  return nullptr;
+                return new VPWidenIntrinsicRecipe(
+                    *CI, VPID, Ops, CI->getType(), CI->getDebugLoc());
+              })
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
112 changes: 56 additions & 56 deletions llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
@@ -27,7 +27,7 @@ define void @vp_smax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMAX:%.+]]> = call llvm.smax(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMAX:%.+]]> = call llvm.vp.smax(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMAX]]>, vp<[[EVL]]>
@@ -39,20 +39,20 @@ define void @vp_smax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: }

entry:
-  br label %for.body
+  br label %loop

-for.body:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx3, align 4
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %gep, align 4
+  %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %gep3, align 4
  %. = tail call i32 @llvm.smax.i32(i32 %0, i32 %1)
-  %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
-  store i32 %., ptr %arrayidx11, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
-  br i1 %exitcond.not, label %exit, label %for.body
+  %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %., ptr %gep11, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop

exit:
ret void
@@ -80,7 +80,7 @@ define void @vp_smin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMIN:%.+]]> = call llvm.smin(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMIN:%.+]]> = call llvm.vp.smin(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMIN]]>, vp<[[EVL]]>
@@ -92,20 +92,20 @@ define void @vp_smin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: }

entry:
-  br label %for.body
+  br label %loop

-for.body:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx3, align 4
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %gep, align 4
+  %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %gep3, align 4
  %. = tail call i32 @llvm.smin.i32(i32 %0, i32 %1)
-  %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
-  store i32 %., ptr %arrayidx11, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
-  br i1 %exitcond.not, label %exit, label %for.body
+  %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %., ptr %gep11, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop

exit:
ret void
@@ -133,7 +133,7 @@ define void @vp_umax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMAX:%.+]]> = call llvm.umax(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMAX:%.+]]> = call llvm.vp.umax(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMAX]]>, vp<[[EVL]]>
@@ -145,20 +145,20 @@ define void @vp_umax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: }

entry:
-  br label %for.body
+  br label %loop

-for.body:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx3, align 4
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %gep, align 4
+  %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %gep3, align 4
  %. = tail call i32 @llvm.umax.i32(i32 %0, i32 %1)
-  %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
-  store i32 %., ptr %arrayidx11, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
-  br i1 %exitcond.not, label %exit, label %for.body
+  %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %., ptr %gep11, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop

exit:
ret void
@@ -186,7 +186,7 @@ define void @vp_umin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMIN:%.+]]> = call llvm.umin(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMIN:%.+]]> = call llvm.vp.umin(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMIN]]>, vp<[[EVL]]>
@@ -198,20 +198,20 @@ define void @vp_umin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: }

entry:
-  br label %for.body
+  br label %loop

-for.body:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx3, align 4
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %gep, align 4
+  %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %gep3, align 4
  %. = tail call i32 @llvm.umin.i32(i32 %0, i32 %1)
-  %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
-  store i32 %., ptr %arrayidx11, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
-  br i1 %exitcond.not, label %exit, label %for.body
+  %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %., ptr %gep11, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop

exit:
ret void