From 02ee96eca90741031a26f0f06cd48bb0ba558d1a Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Mon, 23 Sep 2024 09:56:37 +0100
Subject: [PATCH] [Analysis] Teach isDereferenceableAndAlignedInLoop about SCEV
 predicates (#106562)

Currently, if a loop contains loads that we can prove at compile time
are dereferenceable when certain conditions are satisfied, the function
isDereferenceableAndAlignedInLoop will still return false. This is
because getSmallConstantMaxTripCount returns 0 whenever SCEV predicates
are required. This patch changes getSmallConstantMaxTripCount to take
an optional Predicates pointer argument, so that callers such as
isDereferenceableAndAlignedInLoop can handle more cases by collecting
the required predicates.
---
 llvm/include/llvm/Analysis/Loads.h            |  14 +-
 llvm/include/llvm/Analysis/ScalarEvolution.h  |  20 ++-
 llvm/lib/Analysis/Loads.cpp                   |  17 ++-
 llvm/lib/Analysis/ScalarEvolution.cpp         |  52 +++++--
 .../Vectorize/LoopVectorizationLegality.cpp   |  12 +-
 .../ScalarEvolution/exit-count-non-strict.ll  |   6 +
 .../ScalarEvolution/finite-trip-count.ll      |   6 +
 .../Analysis/ScalarEvolution/ne-overflow.ll   |   3 +
 .../ScalarEvolution/predicated-exit-count.ll  |   4 +
 ...cated-symbolic-max-backedge-taken-count.ll |   6 +
 .../trip-count-implied-addrec.ll              |  15 +++
 .../LoopVectorize/load-deref-pred-align.ll    | 127 ++++++++++++++++++
 .../LoopVectorize/simple_early_exit.ll        |   8 +-
 13 files changed, 256 insertions(+), 34 deletions(-)
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 1f01ff7027fa9a..639070c07897b0 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -27,6 +27,8 @@ class LoadInst;
 class Loop;
 class MemoryLocation;
 class ScalarEvolution;
+class SCEVPredicate;
+template <typename T> class SmallVectorImpl;
 class TargetLibraryInfo;
 
 /// Return true if this is always a dereferenceable pointer. If the context
@@ -81,14 +83,16 @@ bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size,
 /// that required by the header itself and could be hoisted into the header
 /// if desired.)  This is more powerful than the variants above when the
 /// address loaded from is analyzeable by SCEV.
-bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
-                                       ScalarEvolution &SE, DominatorTree &DT,
-                                       AssumptionCache *AC = nullptr);
+bool isDereferenceableAndAlignedInLoop(
+    LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+    AssumptionCache *AC = nullptr,
+    SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
 
 /// Return true if the loop \p L cannot fault on any iteration and only
 /// contains read-only memory accesses.
-bool isDereferenceableReadOnlyLoop(Loop *L, ScalarEvolution *SE,
-                                   DominatorTree *DT, AssumptionCache *AC);
+bool isDereferenceableReadOnlyLoop(
+    Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
 
 /// Return true if we know that executing a load from this value cannot trap.
 ///
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 44fb249d584d88..68b860725752d0 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -823,8 +823,11 @@ class ScalarEvolution {
 
   /// Returns the upper bound of the loop trip count as a normal unsigned
   /// value.
-  /// Returns 0 if the trip count is unknown or not constant.
-  unsigned getSmallConstantMaxTripCount(const Loop *L);
+  /// Returns 0 if the trip count is unknown or not constant, or if it requires
+  /// SCEV predicates and \p Predicates is nullptr.
+  unsigned getSmallConstantMaxTripCount(
+      const Loop *L,
+      SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
 
   /// Returns the largest constant divisor of the trip count as a normal
   /// unsigned value, if possible. This means that the actual trip count is
@@ -905,6 +908,13 @@ class ScalarEvolution {
     return getBackedgeTakenCount(L, ConstantMaximum);
   }
 
+  /// Similar to getConstantMaxBackedgeTakenCount, except it will add a set of
+  /// SCEV predicates to \p Predicates that are required to be true in order
+  /// for the answer to be correct. Predicates can be checked with run-time
+  /// checks and can be used to perform loop versioning.
+  const SCEV *getPredicatedConstantMaxBackedgeTakenCount(
+      const Loop *L, SmallVectorImpl<const SCEVPredicate *> &Predicates);
+
   /// When successful, this returns a SCEV that is greater than or equal
   /// to (i.e. a "conservative over-approximation") of the value returend by
   /// getBackedgeTakenCount.  If such a value cannot be computed, it returns the
@@ -1506,7 +1516,7 @@ class ScalarEvolution {
 
     /// Expression indicating the least constant maximum backedge-taken count of
     /// the loop that is known, or a SCEVCouldNotCompute. This expression is
-    /// only valid if the redicates associated with all loop exits are true.
+    /// only valid if the predicates associated with all loop exits are true.
     const SCEV *ConstantMax = nullptr;
 
     /// Indicating if \c ExitNotTaken has an element for every exiting block in
@@ -1585,7 +1595,9 @@ class ScalarEvolution {
     }
 
     /// Get the constant max backedge taken count for the loop.
-    const SCEV *getConstantMax(ScalarEvolution *SE) const;
+    const SCEV *getConstantMax(
+        ScalarEvolution *SE,
+        SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr) const;
 
     /// Get the constant max backedge taken count for the particular loop exit.
     const SCEV *getConstantMax(
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 11f3807ffacf6e..f4b202791a7081 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -276,10 +276,9 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
   return false;
 }
 
-bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
-                                             ScalarEvolution &SE,
-                                             DominatorTree &DT,
-                                             AssumptionCache *AC) {
+bool llvm::isDereferenceableAndAlignedInLoop(
+    LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+    AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
   auto &DL = LI->getDataLayout();
   Value *Ptr = LI->getPointerOperand();
 
@@ -304,7 +303,7 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
   if (!Step)
     return false;
 
-  auto TC = SE.getSmallConstantMaxTripCount(L);
+  auto TC = SE.getSmallConstantMaxTripCount(L, Predicates);
   if (!TC)
     return false;
 
@@ -810,13 +809,13 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
   return isPointerAlwaysReplaceable(From, To, DL);
 }
 
-bool llvm::isDereferenceableReadOnlyLoop(Loop *L, ScalarEvolution *SE,
-                                         DominatorTree *DT,
-                                         AssumptionCache *AC) {
+bool llvm::isDereferenceableReadOnlyLoop(
+    Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    SmallVectorImpl<const SCEVPredicate *> *Predicates) {
   for (BasicBlock *BB : L->blocks()) {
     for (Instruction &I : *BB) {
       if (auto *LI = dyn_cast<LoadInst>(&I)) {
-        if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC))
+        if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
           return false;
       } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
         return false;
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 1d3443588ce60d..233f8edca5b13b 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -8191,10 +8191,13 @@ ScalarEvolution::getSmallConstantTripCount(const Loop *L,
   return getConstantTripCount(ExitCount);
 }
 
-unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) {
+unsigned ScalarEvolution::getSmallConstantMaxTripCount(
+    const Loop *L, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+
   const auto *MaxExitCount =
-      dyn_cast<SCEVConstant>(getConstantMaxBackedgeTakenCount(L));
-  return getConstantTripCount(MaxExitCount);
+      Predicates ? getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates)
+                 : getConstantMaxBackedgeTakenCount(L);
+  return getConstantTripCount(dyn_cast<SCEVConstant>(MaxExitCount));
 }
 
 unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) {
@@ -8303,6 +8306,11 @@ const SCEV *ScalarEvolution::getPredicatedSymbolicMaxBackedgeTakenCount(
   return getPredicatedBackedgeTakenInfo(L).getSymbolicMax(L, this, &Preds);
 }
 
+const SCEV *ScalarEvolution::getPredicatedConstantMaxBackedgeTakenCount(
+    const Loop *L, SmallVectorImpl<const SCEVPredicate *> &Preds) {
+  return getPredicatedBackedgeTakenInfo(L).getConstantMax(this, &Preds);
+}
+
 bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) {
   return getBackedgeTakenInfo(L).isConstantMaxOrZero(this);
 }
@@ -8624,15 +8632,19 @@ ScalarEvolution::BackedgeTakenInfo::getExitNotTaken(
 }
 
 /// getConstantMax - Get the constant max backedge taken count for the loop.
-const SCEV *
-ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const {
-  auto PredicateNotAlwaysTrue = [](const ExitNotTakenInfo &ENT) {
-    return !ENT.hasAlwaysTruePredicate();
-  };
-
-  if (!getConstantMax() || any_of(ExitNotTaken, PredicateNotAlwaysTrue))
+const SCEV *ScalarEvolution::BackedgeTakenInfo::getConstantMax(
+    ScalarEvolution *SE,
+    SmallVectorImpl<const SCEVPredicate *> *Predicates) const {
+  if (!getConstantMax())
     return SE->getCouldNotCompute();
 
+  for (const auto &ENT : ExitNotTaken)
+    if (!ENT.hasAlwaysTruePredicate()) {
+      if (!Predicates)
+        return SE->getCouldNotCompute();
+      append_range(*Predicates, ENT.Predicates);
+    }
+
   assert((isa<SCEVCouldNotCompute>(getConstantMax()) ||
           isa<SCEVConstant>(getConstantMax())) &&
          "No point in having a non-constant max backedge taken count!");
@@ -13749,8 +13761,28 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
     for (const auto *P : Preds)
       P->print(OS, 4);
   }
+  Preds.clear();
 
+  auto *PredConstantMax =
+      SE->getPredicatedConstantMaxBackedgeTakenCount(L, Preds);
+  if (PredConstantMax != ConstantBTC) {
+    assert(!Preds.empty() &&
+           "different predicated constant max BTC but no predicates");
+    OS << "Loop ";
+    L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+    OS << ": ";
+    if (!isa<SCEVCouldNotCompute>(PredConstantMax)) {
+      OS << "Predicated constant max backedge-taken count is ";
+      PrintSCEVWithTypeHint(OS, PredConstantMax);
+    } else
+      OS << "Unpredictable predicated constant max backedge-taken count.";
+    OS << "\n";
+    OS << " Predicates:\n";
+    for (const auto *P : Preds)
+      P->print(OS, 4);
+  }
   Preds.clear();
+
   auto *PredSymbolicMax =
       SE->getPredicatedSymbolicMaxBackedgeTakenCount(L, Preds);
   if (SymbolicBTC != PredSymbolicMax) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index a4787483813a9a..b767372a56b914 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1334,11 +1334,17 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
     // we restrict this to loads; stores are more complicated due to
     // concurrency restrictions.
     ScalarEvolution &SE = *PSE.getSE();
+    SmallVector<const SCEVPredicate *, 4> Predicates;
     for (Instruction &I : *BB) {
       LoadInst *LI = dyn_cast<LoadInst>(&I);
+      // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
+      // that it will consider loops that need guarding by SCEV checks. The
+      // vectoriser will generate these checks if we decide to vectorise.
       if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
-          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC))
+          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC,
+                                            &Predicates))
         SafePointers.insert(LI->getPointerOperand());
+      Predicates.clear();
     }
   }
 
@@ -1564,7 +1570,9 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
          "Expected latch predecessor to be the early exiting block");
 
   // TODO: Handle loops that may fault.
-  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC)) {
+  Predicates.clear();
+  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
+                                     &Predicates)) {
     reportVectorizationFailure(
         "Loop may fault",
         "Cannot vectorize potentially faulting early exit loop",
diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
index 6d64f76494638f..f7a18c77a82c8f 100644
--- a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
+++ b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
@@ -109,6 +109,9 @@ define void @ule_from_zero_no_nuw(i32 %M, i32 %N) {
 ; CHECK-NEXT:  Loop %loop: Predicated backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))<nuw><nsw>)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%loop> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %loop: Predicated constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%loop> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %loop: Predicated symbolic max backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))<nuw><nsw>)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%loop> Added Flags: <nusw>
@@ -238,6 +241,9 @@ define void @sle_from_int_min_no_nsw(i32 %M, i32 %N) {
 ; CHECK-NEXT:  Loop %loop: Predicated backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))<nsw>)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {-2147483648,+,1}<%loop> Added Flags: <nssw>
+; CHECK-NEXT:  Loop %loop: Predicated constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {-2147483648,+,1}<%loop> Added Flags: <nssw>
 ; CHECK-NEXT:  Loop %loop: Predicated symbolic max backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))<nsw>)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {-2147483648,+,1}<%loop> Added Flags: <nssw>
diff --git a/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll b/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
index 471954f44311d4..a1538fd78ba17d 100644
--- a/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
+++ b/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
@@ -59,6 +59,9 @@ define void @sle_pre_inc_infinite(i32 %len) {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (0 smax (1 + (sext i32 %len to i64))<nsw>)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nssw>
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i64 2147483648
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nssw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (0 smax (1 + (sext i32 %len to i64))<nsw>)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nssw>
@@ -130,6 +133,9 @@ define void @ule_pre_inc_infinite(i32 %len) {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (1 + (zext i32 %len to i64))<nuw><nsw>
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i64 4294967296
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (1 + (zext i32 %len to i64))<nuw><nsw>
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
diff --git a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
index 49288c85897fd9..3022281658a75f 100644
--- a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
+++ b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
@@ -240,6 +240,9 @@ define void @test_zext(i64 %N) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (%N /u 2)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,2}<nuw><%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i64 9223372036854775807
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,2}<nuw><%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (%N /u 2)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,2}<nuw><%for.body> Added Flags: <nusw>
diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-exit-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-exit-count.ll
index de214183710ab3..3b398d422e36a3 100644
--- a/llvm/test/Analysis/ScalarEvolution/predicated-exit-count.ll
+++ b/llvm/test/Analysis/ScalarEvolution/predicated-exit-count.ll
@@ -30,6 +30,10 @@ define i32 @multiple_exits_with_predicates(ptr %src1, ptr readonly %src2, i32 %e
 ; CHECK-NEXT:     Predicates:
 ; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-EMPTY:
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i32 1023
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (1023 umin (-1 + (1 umax %end)))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
index 2ec6158e9b0920..ee6052685b43b5 100644
--- a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
+++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
@@ -20,6 +20,9 @@ define void @test1(i64 %x, ptr %a, ptr %b) {
 ; CHECK-NEXT:     Predicates:
 ; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
 ; CHECK-EMPTY:
+; CHECK-NEXT:  Loop %header: Predicated constant max backedge-taken count is i64 -2
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
@@ -71,6 +74,9 @@ define void @test2(i64 %x, ptr %a) {
 ; CHECK-NEXT:     Predicates:
 ; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
 ; CHECK-EMPTY:
+; CHECK-NEXT:  Loop %header: Predicated constant max backedge-taken count is i64 -2
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
index b313842ad5e1a9..2ee2ec53f6c9e9 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
@@ -61,6 +61,9 @@ define void @nw_implies_nsw(i16 %n) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (128 + (-128 smax %n))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {-128,+,1}<%for.body> Added Flags: <nssw>
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i16 -32641
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {-128,+,1}<%for.body> Added Flags: <nssw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (128 + (-128 smax %n))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {-128,+,1}<%for.body> Added Flags: <nssw>
@@ -110,6 +113,9 @@ define void @actually_infinite() {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is i16 257
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i16 257
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is i16 257
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {0,+,1}<%for.body> Added Flags: <nusw>
@@ -138,6 +144,9 @@ define void @rhs_mustexit_1(i16 %n.raw) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<nw><%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i16 -2
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<nw><%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<nw><%for.body> Added Flags: <nusw>
@@ -266,6 +275,9 @@ define void @neg_rhs_maybe_infinite(i16 %n.raw) {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i16 -2
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {1,+,1}<%for.body> Added Flags: <nusw>
@@ -391,6 +403,9 @@ define void @ult_constant_rhs_stride2_neg(i16 %n.raw, i8 %start) {
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is ((256 + (-1 * (zext i8 (2 + %start) to i16))<nsw>)<nsw> /u 2)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {(2 + %start),+,2}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:  Loop %for.body: Predicated constant max backedge-taken count is i16 128
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {(2 + %start),+,2}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:  Loop %for.body: Predicated symbolic max backedge-taken count is ((256 + (-1 * (zext i8 (2 + %start) to i16))<nsw>)<nsw> /u 2)
 ; CHECK-NEXT:   Predicates:
 ; CHECK-NEXT:      {(2 + %start),+,2}<%for.body> Added Flags: <nusw>
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index a7c9a18127ade5..1ef01e3b793d5b 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -296,3 +296,130 @@ latch:
 loop_exit:
   ret i8 %accum.next
 }
+
+
+define i32 @loop_requires_scev_predicate(ptr %dest, i32 %end) {
+; CHECK-LABEL: @loop_requires_scev_predicate(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[P1]])
+; CHECK-NEXT:    call void @init(ptr [[P2]])
+; CHECK-NEXT:    [[END_CLAMPED:%.*]] = and i32 [[END:%.*]], 1023
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[END]] to i10
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i10 [[TMP0]] to i64
+; CHECK-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i32 [[UMAX]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[TMP4:%.*]] = add i8 1, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i8 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP2]], 255
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i8
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE5:%.*]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[P2]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DEST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    store i32 [[TMP18]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
+; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5]]
+; CHECK:       pred.store.if4:
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i32 1
+; CHECK-NEXT:    [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    store i32 [[TMP24]], ptr [[TMP21]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; CHECK:       pred.store.continue5:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]]
+; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[DOWORK:%.*]] = icmp ne i32 [[TMP26]], 0
+; CHECK-NEXT:    br i1 [[DOWORK]], label [[FOR_DOWORK:%.*]], label [[FOR_INC]]
+; CHECK:       for.dowork:
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[GEP_IND]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[IND_NEXT]] = add i8 [[IND]], 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32
+; CHECK-NEXT:    [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %p1 = alloca [1024 x i32]
+  %p2 = alloca [1024 x i32]
+  call void @init(ptr %p1)
+  call void @init(ptr %p2)
+  %end.clamped = and i32 %end, 1023
+  br label %for.body
+
+for.body:
+  %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ]
+  %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %gep.ind
+  %0 = load i32, ptr %arrayidx, align 4
+  %dowork = icmp ne i32 %0, 0
+  br i1 %dowork, label %for.dowork, label %for.inc
+
+for.dowork:
+  %arrayidx3 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind
+  %1 = load i32, ptr %arrayidx3, align 4
+  %add = add i32 %0, %1
+  %arrayidx5 = getelementptr inbounds i32, ptr %dest, i64 %gep.ind
+  store i32 %add, ptr %arrayidx5, align 4
+  br label %for.inc
+
+for.inc:
+  %ind.next = add i8 %ind, 1
+  %conv = zext i8 %ind.next to i32
+  %gep.ind.next = add i64 %gep.ind, 1
+  %cmp = icmp ult i32 %conv, %end.clamped
+  br i1 %cmp, label %for.body, label %exit
+
+exit:
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
index dcf5c9d8ac64d1..936c07b4853a38 100644
--- a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
@@ -1621,12 +1621,12 @@ loop.end:
 
 
 ; The form of the induction variables requires SCEV predicates.
-; TODO: We should fix isDereferenceableAndAlignedInLoop and
-; getSmallConstantMaxTripCount to cope with SCEV predicates when
-; requesting the small constant max trip count.
 define i32 @diff_exit_block_needs_scev_check(i32 %end) {
 ; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check'
-; DEBUG:       LV: Not vectorizing: Loop may fault.
+; DEBUG:       LV: Found an early exit. Retrying with speculative exit count.
+; DEBUG-NEXT:  LV: Found speculative backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32)))<nsw>
+; DEBUG-NEXT:  LV: We can vectorize this loop!
+; DEBUG-NEXT:  LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported.
 ; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check(
 ; CHECK-SAME: i32 [[END:%.*]]) {
 ; CHECK-NEXT:  entry: