Skip to content

Commit

Permalink
[LV][EVL] Support call instruction with EVL-vectorization
Browse files Browse the repository at this point in the history
Only supports smax/smin/umax/umin.
  • Loading branch information
LiqinWeng committed Sep 30, 2024
1 parent c1621ed commit 541f273
Show file tree
Hide file tree
Showing 13 changed files with 220 additions and 22 deletions.
6 changes: 6 additions & 0 deletions llvm/include/llvm/Analysis/VectorUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,12 @@ bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx);
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
const TargetLibraryInfo *TLI);

/// Returns the VP intrinsic ID for a call.
/// For the given call instruction this finds the matching intrinsic and
/// returns its VP intrinsic ID; if no mapping is found, it returns
/// Intrinsic::not_intrinsic.
Intrinsic::ID getVPIntrinsicIDForCall(const CallInst *CI,
const TargetLibraryInfo *TLI);

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
Expand Down
4 changes: 2 additions & 2 deletions llvm/include/llvm/IR/VectorBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,11 @@ class VectorBuilder {
const Twine &Name = Twine());

/// Emit a VP reduction intrinsic call for recurrence kind.
/// \param RdxID The intrinsic ID of llvm.vector.reduce.*
/// \param ID The intrinsic ID of Call Intrinsic
/// \param ValTy The type of operand which the reduction operation is
/// performed.
/// \param VecOpArray The operand list.
Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy,
Value *createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy,
ArrayRef<Value *> VecOpArray,
const Twine &Name = Twine());
};
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,13 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
return Intrinsic::not_intrinsic;
}

Intrinsic::ID llvm::getVPIntrinsicIDForCall(const CallInst *CI,
                                            const TargetLibraryInfo *TLI) {
  // First resolve the call to a plain intrinsic ID, then translate that to
  // the matching vector-predicated (VP) intrinsic. Both steps fall through to
  // Intrinsic::not_intrinsic when no mapping exists.
  return VPIntrinsic::getForIntrinsic(getIntrinsicForCallSite(*CI, TLI));
}

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
Expand Down
9 changes: 4 additions & 5 deletions llvm/lib/IR/VectorBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,12 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
}

Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID,
Type *ValTy,
Value *VectorBuilder::createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy,
ArrayRef<Value *> InstOpArray,
const Twine &Name) {
auto VPID = VPIntrinsic::getForIntrinsic(RdxID);
assert(VPReductionIntrinsic::isVPReduction(VPID) &&
"No VPIntrinsic for this reduction");
auto VPID = VPIntrinsic::getForIntrinsic(ID);
assert(VPIntrinsic::isVPIntrinsic(VPID) &&
"No VPIntrinsic for this Intrinsic");
return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
}

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1299,7 +1299,7 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src,
Type *SrcEltTy = SrcTy->getElementType();
Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
Value *Ops[] = {Iden, Src};
return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops);
}

Value *llvm::createReduction(IRBuilderBase &B,
Expand Down Expand Up @@ -1342,7 +1342,7 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd);
auto *SrcTy = cast<VectorType>(Src->getType());
Value *Ops[] = {Start, Src};
return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops);
}

void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8616,7 +8616,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
if (CM.foldTailWithEVL() &&
!VPlanTransforms::tryAddExplicitVectorLength(*Plan))
!VPlanTransforms::tryAddExplicitVectorLength(*Plan, *TLI))
break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
Expand Down
100 changes: 96 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPScalarIVStepsSC:
case VPRecipeBase::VPVectorPointerSC:
case VPRecipeBase::VPWidenCallSC:
case VPRecipeBase::VPWidenCallEVLSC:
case VPRecipeBase::VPWidenCanonicalIVSC:
case VPRecipeBase::VPWidenCastSC:
case VPRecipeBase::VPWidenGEPSC:
Expand Down Expand Up @@ -1610,6 +1611,7 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {

/// A recipe for widening Call instructions.
class VPWidenCallRecipe : public VPSingleDefRecipe {
public:
/// ID of the vector intrinsic to call when widening the call. If set the
/// Intrinsic::not_intrinsic, a library call will be used instead.
Intrinsic::ID VectorIntrinsicID;
Expand All @@ -1619,26 +1621,48 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
/// VF with a valid variant.
Function *Variant;

public:
protected:
template <typename IterT>
VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
VPWidenCallRecipe(unsigned VPDefOpcode, Value *UV,
iterator_range<IterT> CallArguments,
Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {},
Function *Variant = nullptr)
: VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, UV, DL),
: VPSingleDefRecipe(VPDefOpcode, CallArguments, UV, DL),
VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {
assert(
isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
"last operand must be the called function");
}

public:
template <typename IterT>
VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
Intrinsic::ID VectorIntrinsicID, DebugLoc DL)
: VPWidenCallRecipe(VPDef::VPWidenCallSC, UV, CallArguments,
VectorIntrinsicID, DL) {}

template <typename IterT>
VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
Intrinsic::ID VectorIntrinsicID, DebugLoc DL,
Function *Variant)
: VPWidenCallRecipe(VPDef::VPWidenCallSC, UV, CallArguments,
VectorIntrinsicID, DL, Variant) {}

~VPWidenCallRecipe() override = default;

VPWidenCallRecipe *clone() override {
return new VPWidenCallRecipe(getUnderlyingValue(), operands(),
VectorIntrinsicID, getDebugLoc(), Variant);
}
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPWidenCallSC ||
R->getVPDefID() == VPRecipeBase::VPWidenCallEVLSC;
}

VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
static inline bool classof(const VPUser *U) {
auto *R = dyn_cast<VPRecipeBase>(U);
return R && classof(R);
}

/// Produce a widened version of the call instruction.
void execute(VPTransformState &State) override;
Expand All @@ -1665,6 +1689,74 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
#endif
};

/// A recipe for widening Call instructions with vector-predication intrinsics
/// with explicit vector length (EVL).
///
/// Operand layout is inherited from VPWidenCallRecipe with the EVL appended:
/// [call arguments..., callee, EVL].
class VPWidenCallEVLRecipe : public VPWidenCallRecipe {
public:
  template <typename IterT>
  VPWidenCallEVLRecipe(Value *UV, iterator_range<IterT> CallArguments,
                       Intrinsic::ID VectorIntrinsicID, DebugLoc DL,
                       VPValue &EVL)
      : VPWidenCallRecipe(VPDef::VPWidenCallEVLSC, UV, CallArguments,
                          VectorIntrinsicID, DL) {
    // EVL is always appended as the last operand.
    addOperand(&EVL);
  }

  /// Build an EVL recipe from an existing widened-call recipe \p W.
  VPWidenCallEVLRecipe(VPWidenCallRecipe &W, Intrinsic::ID VectorIntrinsicID,
                       DebugLoc DL, VPValue &EVL)
      : VPWidenCallEVLRecipe(W.getUnderlyingValue(), W.operands(),
                             VectorIntrinsicID, DL, EVL) {}

  ~VPWidenCallEVLRecipe() override = default;

  VPWidenCallEVLRecipe *clone() override {
    llvm_unreachable("VPWidenCallEVLRecipe cannot be cloned");
    return nullptr;
  }

  /// Return the EVL operand (always the last operand).
  VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
  const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }

  VP_CLASSOF_IMPL(VPDef::VPWidenCallEVLSC)

  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const final;

  /// Return the called scalar function; with the trailing EVL operand the
  /// callee is the second-to-last operand.
  Function *getCalledScalarFunction() const {
    return cast<Function>(getOperand(getNumOperands() - 2)->getLiveInIRValue());
  }

  /// Call arguments exclude the trailing callee and EVL operands.
  operand_range arg_operands() {
    return make_range(op_begin(), op_begin() + getNumOperands() - 2);
  }
  const_operand_range arg_operands() const {
    return make_range(op_begin(), op_begin() + getNumOperands() - 2);
  }

  /// Produce a widened version of the call instruction.
  void execute(VPTransformState &State) final;

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    // EVL in that recipe is always the last operand, thus any use before means
    // the VPValue should be vectorized.
    return getEVL() == Op;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const final;
#endif
};

/// A recipe representing a sequence of load -> update -> store as part of
/// a histogram operation. This means there may be aliasing between vector
/// lanes, which is handled by the llvm.experimental.vector.histogram family
Expand Down
81 changes: 81 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayWriteToMemory();
case VPWidenCallSC:
// case VPWidenCallEVLSC:
return !cast<VPWidenCallRecipe>(this)
->getCalledScalarFunction()
->onlyReadsMemory();
Expand Down Expand Up @@ -117,6 +118,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayReadFromMemory();
case VPWidenCallSC:
// case VPWidenCallEVLSC:
return !cast<VPWidenCallRecipe>(this)
->getCalledScalarFunction()
->onlyWritesMemory();
Expand Down Expand Up @@ -158,6 +160,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInstructionSC:
return mayWriteToMemory();
case VPWidenCallSC: {
// case VPWidenCallEVLSC: {
Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
}
Expand Down Expand Up @@ -951,6 +954,52 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
State.addMetadata(V, CI);
}

void VPWidenCallEVLRecipe::execute(VPTransformState &State) {
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;

  // TODO: support more intrinsics. Currently only
  // llvm.smax/llvm.smin/llvm.umax/llvm.umin are handled.
  auto *TysForDecl = VectorType::get(
      CalledScalarFn->getReturnType()->getScalarType(), State.VF);

  // Collect the widened arguments; operands the intrinsic requires as scalars
  // are taken from lane 0 instead of being vectorized.
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(arg_operands())) {
    Value *Arg;
    if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(
                            CalledScalarFn->getIntrinsicID(), I.index()))
      Arg = State.get(I.value(), VPLane(0));
    else
      Arg = State.get(I.value());
    Args.push_back(Arg);
  }

  // Emit the VP intrinsic with an all-true mask; predication is expressed
  // solely via the explicit vector length operand.
  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder VBuilder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
  VBuilder.setMask(Mask).setEVL(State.get(getEVL(), /*NeedsScalar=*/true));

  Value *VPInst = VBuilder.createSimpleIntrinsic(VectorIntrinsicID, TysForDecl,
                                                 Args, "vp.call");
  // FIXME: IR/Recipe/EVLRecipe has same the flags. Can copy from IR?
  if (auto *VecOp = dyn_cast_or_null<CallInst>(VPInst))
    VecOp->copyIRFlags(getUnderlyingInstr());

  State.set(this, VPInst);
  // The underlying instruction is assumed to be a CallInst when present;
  // addMetadata tolerates a null source instruction.
  State.addMetadata(VPInst, cast_or_null<CallInst>(getUnderlyingInstr()));
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Expand Down Expand Up @@ -998,6 +1047,12 @@ InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

// TODO: Implement a dedicated cost model for VPWidenCallEVLRecipe; for now
// this falls back to the generic VPRecipeBase cost computation.
InstructionCost VPWidenCallEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  return VPRecipeBase::computeCost(VF, Ctx);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
Expand Down Expand Up @@ -1115,6 +1170,32 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
}
}

void VPWidenCallEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  // Void calls define no VPValue, so there is no result operand to print.
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "vp.call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  // Compare explicitly against not_intrinsic instead of relying on the
  // implicit Intrinsic::ID -> bool conversion.
  if (VectorIntrinsicID != Intrinsic::not_intrinsic)
    O << " (using vector intrinsic)";
  else
    O << " (using library function)";
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
Expand Down
14 changes: 11 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1350,7 +1350,8 @@ void VPlanTransforms::addActiveLaneMask(
}

/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL,
const TargetLibraryInfo &TLI) {
SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
Expand Down Expand Up @@ -1379,6 +1380,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
return nullptr;
return new VPWidenEVLRecipe(*W, EVL);
})
.Case<VPWidenCallRecipe>([&](VPWidenCallRecipe *W) {
auto *CI = cast<CallInst>(W->getUnderlyingInstr());
Intrinsic::ID VPID = getVPIntrinsicIDForCall(CI, &TLI);
return new VPWidenCallEVLRecipe(*W, VPID, CI->getDebugLoc(),
EVL);
})
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
Expand Down Expand Up @@ -1429,7 +1436,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan,
const TargetLibraryInfo &TLI) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
Expand Down Expand Up @@ -1481,7 +1489,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
NextEVLIV->insertBefore(CanonicalIVIncrement);
EVLPhi->addOperand(NextEVLIV);

transformRecipestoEVLRecipes(Plan, *VPEVL);
transformRecipestoEVLRecipes(Plan, *VPEVL, TLI);

// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
Expand Down
Loading

0 comments on commit 541f273

Please sign in to comment.