From 02ee96eca90741031a26f0f06cd48bb0ba558d1a Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Mon, 23 Sep 2024 09:56:37 +0100 Subject: [PATCH] [Analysis] Teach isDereferenceableAndAlignedInLoop about SCEV predicates (#106562) Currently, if a loop contains loads that we can prove at compile time are dereferenceable when certain conditions are satisfied, the function isDereferenceableAndAlignedInLoop will still return false, because getSmallConstantMaxTripCount returns 0 whenever SCEV predicates are required. This patch changes getSmallConstantMaxTripCount to take an optional Predicates pointer argument, so that functions such as isDereferenceableAndAlignedInLoop can consider more cases; the predicates that the result depends on are appended to that list. --- llvm/include/llvm/Analysis/Loads.h | 14 +- llvm/include/llvm/Analysis/ScalarEvolution.h | 20 ++- llvm/lib/Analysis/Loads.cpp | 17 ++- llvm/lib/Analysis/ScalarEvolution.cpp | 52 +++++-- .../Vectorize/LoopVectorizationLegality.cpp | 12 +- .../ScalarEvolution/exit-count-non-strict.ll | 6 + .../ScalarEvolution/finite-trip-count.ll | 6 + .../Analysis/ScalarEvolution/ne-overflow.ll | 3 + .../ScalarEvolution/predicated-exit-count.ll | 4 + ...cated-symbolic-max-backedge-taken-count.ll | 6 + .../trip-count-implied-addrec.ll | 15 +++ .../LoopVectorize/load-deref-pred-align.ll | 127 ++++++++++++++++++ .../LoopVectorize/simple_early_exit.ll | 8 +- 13 files changed, 256 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h index 1f01ff7027fa9a..639070c07897b0 100644 --- a/llvm/include/llvm/Analysis/Loads.h +++ b/llvm/include/llvm/Analysis/Loads.h @@ -27,6 +27,8 @@ class LoadInst; class Loop; class MemoryLocation; class ScalarEvolution; +class SCEVPredicate; +template class SmallVectorImpl; class TargetLibraryInfo; /// Return true if this is always a dereferenceable pointer. 
If the context @@ -81,14 +83,16 @@ bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, /// that required by the header itself and could be hoisted into the header /// if desired.) This is more powerful than the variants above when the /// address loaded from is analyzeable by SCEV. -bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, - ScalarEvolution &SE, DominatorTree &DT, - AssumptionCache *AC = nullptr); +bool isDereferenceableAndAlignedInLoop( + LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache *AC = nullptr, + SmallVectorImpl *Predicates = nullptr); /// Return true if the loop \p L cannot fault on any iteration and only /// contains read-only memory accesses. -bool isDereferenceableReadOnlyLoop(Loop *L, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC); +bool isDereferenceableReadOnlyLoop( + Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, + SmallVectorImpl *Predicates = nullptr); /// Return true if we know that executing a load from this value cannot trap. /// diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 44fb249d584d88..68b860725752d0 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -823,8 +823,11 @@ class ScalarEvolution { /// Returns the upper bound of the loop trip count as a normal unsigned /// value. - /// Returns 0 if the trip count is unknown or not constant. - unsigned getSmallConstantMaxTripCount(const Loop *L); + /// Returns 0 if the trip count is unknown, not constant or requires + /// SCEV predicates and \p Predicates is nullptr. + unsigned getSmallConstantMaxTripCount( + const Loop *L, + SmallVectorImpl *Predicates = nullptr); /// Returns the largest constant divisor of the trip count as a normal /// unsigned value, if possible. 
This means that the actual trip count is @@ -905,6 +908,13 @@ class ScalarEvolution { return getBackedgeTakenCount(L, ConstantMaximum); } + /// Similar to getConstantMaxBackedgeTakenCount, except it will add a set of + /// SCEV predicates to Predicates that are required to be true in order for + /// the answer to be correct. Predicates can be checked with run-time + /// checks and can be used to perform loop versioning. + const SCEV *getPredicatedConstantMaxBackedgeTakenCount( + const Loop *L, SmallVectorImpl &Predicates); + /// When successful, this returns a SCEV that is greater than or equal /// to (i.e. a "conservative over-approximation") of the value returend by /// getBackedgeTakenCount. If such a value cannot be computed, it returns the @@ -1506,7 +1516,7 @@ class ScalarEvolution { /// Expression indicating the least constant maximum backedge-taken count of /// the loop that is known, or a SCEVCouldNotCompute. This expression is - /// only valid if the redicates associated with all loop exits are true. + /// only valid if the predicates associated with all loop exits are true. const SCEV *ConstantMax = nullptr; /// Indicating if \c ExitNotTaken has an element for every exiting block in @@ -1585,7 +1595,9 @@ class ScalarEvolution { } /// Get the constant max backedge taken count for the loop. - const SCEV *getConstantMax(ScalarEvolution *SE) const; + const SCEV *getConstantMax( + ScalarEvolution *SE, + SmallVectorImpl *Predicates = nullptr) const; /// Get the constant max backedge taken count for the particular loop exit. 
const SCEV *getConstantMax( diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 11f3807ffacf6e..f4b202791a7081 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -276,10 +276,9 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) { return false; } -bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, - ScalarEvolution &SE, - DominatorTree &DT, - AssumptionCache *AC) { +bool llvm::isDereferenceableAndAlignedInLoop( + LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache *AC, SmallVectorImpl *Predicates) { auto &DL = LI->getDataLayout(); Value *Ptr = LI->getPointerOperand(); @@ -304,7 +303,7 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, if (!Step) return false; - auto TC = SE.getSmallConstantMaxTripCount(L); + auto TC = SE.getSmallConstantMaxTripCount(L, Predicates); if (!TC) return false; @@ -810,13 +809,13 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To, return isPointerAlwaysReplaceable(From, To, DL); } -bool llvm::isDereferenceableReadOnlyLoop(Loop *L, ScalarEvolution *SE, - DominatorTree *DT, - AssumptionCache *AC) { +bool llvm::isDereferenceableReadOnlyLoop( + Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, + SmallVectorImpl *Predicates) { for (BasicBlock *BB : L->blocks()) { for (Instruction &I : *BB) { if (auto *LI = dyn_cast(&I)) { - if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC)) + if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates)) return false; } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow()) return false; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 1d3443588ce60d..233f8edca5b13b 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8191,10 +8191,13 @@ ScalarEvolution::getSmallConstantTripCount(const Loop *L, return 
getConstantTripCount(ExitCount); } -unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) { +unsigned ScalarEvolution::getSmallConstantMaxTripCount( + const Loop *L, SmallVectorImpl *Predicates) { + const auto *MaxExitCount = - dyn_cast(getConstantMaxBackedgeTakenCount(L)); - return getConstantTripCount(MaxExitCount); + Predicates ? getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates) + : getConstantMaxBackedgeTakenCount(L); + return getConstantTripCount(dyn_cast(MaxExitCount)); } unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) { @@ -8303,6 +8306,11 @@ const SCEV *ScalarEvolution::getPredicatedSymbolicMaxBackedgeTakenCount( return getPredicatedBackedgeTakenInfo(L).getSymbolicMax(L, this, &Preds); } +const SCEV *ScalarEvolution::getPredicatedConstantMaxBackedgeTakenCount( + const Loop *L, SmallVectorImpl &Preds) { + return getPredicatedBackedgeTakenInfo(L).getConstantMax(this, &Preds); +} + bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) { return getBackedgeTakenInfo(L).isConstantMaxOrZero(this); } @@ -8624,15 +8632,19 @@ ScalarEvolution::BackedgeTakenInfo::getExitNotTaken( } /// getConstantMax - Get the constant max backedge taken count for the loop. 
-const SCEV * -ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const { - auto PredicateNotAlwaysTrue = [](const ExitNotTakenInfo &ENT) { - return !ENT.hasAlwaysTruePredicate(); - }; - - if (!getConstantMax() || any_of(ExitNotTaken, PredicateNotAlwaysTrue)) +const SCEV *ScalarEvolution::BackedgeTakenInfo::getConstantMax( + ScalarEvolution *SE, + SmallVectorImpl *Predicates) const { + if (!getConstantMax()) return SE->getCouldNotCompute(); + for (const auto &ENT : ExitNotTaken) + if (!ENT.hasAlwaysTruePredicate()) { + if (!Predicates) + return SE->getCouldNotCompute(); + append_range(*Predicates, ENT.Predicates); + } + assert((isa(getConstantMax()) || isa(getConstantMax())) && "No point in having a non-constant max backedge taken count!"); @@ -13749,8 +13761,28 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, for (const auto *P : Preds) P->print(OS, 4); } + Preds.clear(); + auto *PredConstantMax = + SE->getPredicatedConstantMaxBackedgeTakenCount(L, Preds); + if (PredConstantMax != ConstantBTC) { + assert(!Preds.empty() && + "different predicated constant max BTC but no predicates"); + OS << "Loop "; + L->getHeader()->printAsOperand(OS, /*PrintType=*/false); + OS << ": "; + if (!isa(PredConstantMax)) { + OS << "Predicated constant max backedge-taken count is "; + PrintSCEVWithTypeHint(OS, PredConstantMax); + } else + OS << "Unpredictable predicated constant max backedge-taken count."; + OS << "\n"; + OS << " Predicates:\n"; + for (const auto *P : Preds) + P->print(OS, 4); + } Preds.clear(); + auto *PredSymbolicMax = SE->getPredicatedSymbolicMaxBackedgeTakenCount(L, Preds); if (SymbolicBTC != PredSymbolicMax) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index a4787483813a9a..b767372a56b914 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1334,11 
+1334,17 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // we restrict this to loads; stores are more complicated due to // concurrency restrictions. ScalarEvolution &SE = *PSE.getSE(); + SmallVector Predicates; for (Instruction &I : *BB) { LoadInst *LI = dyn_cast(&I); + // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so + // that it will consider loops that need guarding by SCEV checks. The + // vectoriser will generate these checks if we decide to vectorise. if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) && - isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC)) + isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC, + &Predicates)) SafePointers.insert(LI->getPointerOperand()); + Predicates.clear(); } } @@ -1564,7 +1570,9 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { "Expected latch predecessor to be the early exiting block"); // TODO: Handle loops that may fault. - if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC)) { + Predicates.clear(); + if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, + &Predicates)) { reportVectorizationFailure( "Loop may fault", "Cannot vectorize potentially faulting early exit loop", diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll index 6d64f76494638f..f7a18c77a82c8f 100644 --- a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll +++ b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll @@ -109,6 +109,9 @@ define void @ule_from_zero_no_nuw(i32 %M, i32 %N) { ; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: ; CHECK-NEXT: 
Loop %loop: Predicated symbolic max backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,1}<%loop> Added Flags: @@ -238,6 +241,9 @@ define void @sle_from_int_min_no_nsw(i32 %M, i32 %N) { ; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: ; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: diff --git a/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll b/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll index 471954f44311d4..a1538fd78ba17d 100644 --- a/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll +++ b/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll @@ -59,6 +59,9 @@ define void @sle_pre_inc_infinite(i32 %len) { ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (0 smax (1 + (sext i32 %len to i64))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i64 2147483648 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (0 smax (1 + (sext i32 %len to i64))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: @@ -130,6 +133,9 @@ define void @ule_pre_inc_infinite(i32 %len) { ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (1 + (zext i32 %len to i64)) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,1}<%for.body> Added 
Flags: +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (1 + (zext i32 %len to i64)) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: diff --git a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll index 49288c85897fd9..3022281658a75f 100644 --- a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll +++ b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll @@ -240,6 +240,9 @@ define void @test_zext(i64 %N) mustprogress { ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (%N /u 2) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i64 9223372036854775807 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (%N /u 2) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,2}<%for.body> Added Flags: diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-exit-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-exit-count.ll index de214183710ab3..3b398d422e36a3 100644 --- a/llvm/test/Analysis/ScalarEvolution/predicated-exit-count.ll +++ b/llvm/test/Analysis/ScalarEvolution/predicated-exit-count.ll @@ -30,6 +30,10 @@ define i32 @multiple_exits_with_predicates(ptr %src1, ptr readonly %src2, i32 %e ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: ; CHECK-EMPTY: +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i32 1023 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: +; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (1023 umin (-1 + (1 umax %end))) ; 
CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll index 2ec6158e9b0920..ee6052685b43b5 100644 --- a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll +++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll @@ -20,6 +20,9 @@ define void @test1(i64 %x, ptr %a, ptr %b) { ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; CHECK-EMPTY: +; CHECK-NEXT: Loop %header: Predicated constant max backedge-taken count is i64 -2 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x)) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%header> Added Flags: @@ -71,6 +74,9 @@ define void @test2(i64 %x, ptr %a) { ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; CHECK-EMPTY: +; CHECK-NEXT: Loop %header: Predicated constant max backedge-taken count is i64 -2 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: ; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x)) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%header> Added Flags: diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll index b313842ad5e1a9..2ee2ec53f6c9e9 100644 --- a/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll +++ b/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll @@ -61,6 +61,9 @@ define void @nw_implies_nsw(i16 %n) mustprogress { ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (128 + (-128 smax %n)) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {-128,+,1}<%for.body> Added Flags: +; CHECK-NEXT: Loop %for.body: 
Predicated constant max backedge-taken count is i16 -32641 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-128,+,1}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (128 + (-128 smax %n)) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {-128,+,1}<%for.body> Added Flags: @@ -110,6 +113,9 @@ define void @actually_infinite() { ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is i16 257 ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i16 257 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is i16 257 ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: @@ -138,6 +144,9 @@ define void @rhs_mustexit_1(i16 %n.raw) mustprogress { ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16)))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i16 -2 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16)))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: @@ -266,6 +275,9 @@ define void @neg_rhs_maybe_infinite(i16 %n.raw) { ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16)))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i16 -2 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (-1 + 
(1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16)))) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: @@ -391,6 +403,9 @@ define void @ult_constant_rhs_stride2_neg(i16 %n.raw, i8 %start) { ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((256 + (-1 * (zext i8 (2 + %start) to i16))) /u 2) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {(2 + %start),+,2}<%for.body> Added Flags: +; CHECK-NEXT: Loop %for.body: Predicated constant max backedge-taken count is i16 128 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {(2 + %start),+,2}<%for.body> Added Flags: ; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((256 + (-1 * (zext i8 (2 + %start) to i16))) /u 2) ; CHECK-NEXT: Predicates: ; CHECK-NEXT: {(2 + %start),+,2}<%for.body> Added Flags: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index a7c9a18127ade5..1ef01e3b793d5b 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -296,3 +296,130 @@ latch: loop_exit: ret i8 %accum.next } + + +define i32 @loop_requires_scev_predicate(ptr %dest, i32 %end) { +; CHECK-LABEL: @loop_requires_scev_predicate( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[P1]]) +; CHECK-NEXT: call void @init(ptr [[P2]]) +; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END:%.*]], 1023 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[END]] to i10 +; CHECK-NEXT: [[TMP1:%.*]] = zext i10 [[TMP0]] to i64 +; CHECK-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[UMAX:%.*]] = call i32 
@llvm.umax.i32(i32 [[END_CLAMPED]], i32 1) +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[UMAX]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = add i8 1, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i8 [[TMP4]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP2]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i8 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE5:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P2]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x i32>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DEST:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; CHECK-NEXT: br label 
[[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5]] +; CHECK: pred.store.if4: +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]] +; CHECK: pred.store.continue5: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[DOWORK:%.*]] = icmp ne i32 [[TMP26]], 0 +; CHECK-NEXT: br i1 [[DOWORK]], label [[FOR_DOWORK:%.*]], label [[FOR_INC]] 
+; CHECK: for.dowork: +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[GEP_IND]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 +; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init(ptr %p1) + call void @init(ptr %p2) + %end.clamped = and i32 %end, 1023 + br label %for.body + +for.body: + %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] + %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %gep.ind + %0 = load i32, ptr %arrayidx, align 4 + %dowork = icmp ne i32 %0, 0 + br i1 %dowork, label %for.dowork, label %for.inc + +for.dowork: + %arrayidx3 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind + %1 = load i32, ptr %arrayidx3, align 4 + %add = add i32 %0, %1 + %arrayidx5 = getelementptr inbounds i32, ptr %dest, i64 %gep.ind + store i32 %add, ptr %arrayidx5, align 4 + br label %for.inc + +for.inc: + %ind.next = add i8 %ind, 1 + %conv = zext i8 %ind.next to i32 + %gep.ind.next = add i64 %gep.ind, 1 + %cmp = icmp ult i32 %conv, %end.clamped + br i1 %cmp, label %for.body, label %exit + +exit: + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll index dcf5c9d8ac64d1..936c07b4853a38 100644 --- 
a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll @@ -1621,12 +1621,12 @@ loop.end: ; The form of the induction variables requires SCEV predicates. -; TODO: We should fix isDereferenceableAndAlignedInLoop and -; getSmallConstantMaxTripCount to cope with SCEV predicates when -; requesting the small constant max trip count. define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' -; DEBUG: LV: Not vectorizing: Loop may fault. +; DEBUG: LV: Found an early exit. Retrying with speculative exit count. +; DEBUG-NEXT: LV: Found speculative backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) +; DEBUG-NEXT: LV: We can vectorize this loop! +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported. ; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( ; CHECK-SAME: i32 [[END:%.*]]) { ; CHECK-NEXT: entry: