Skip to content

Commit

Permalink
[LoopVectorize][NFC] Rewrite tests to check output of vplan cost model (
Browse files Browse the repository at this point in the history
llvm#113697)

Currently it's very difficult to improve the cost model for tail-folded
loops because as soon as you add a VPInstruction::computeCost function
that adds the costs of instructions such as
VPInstruction::ActiveLaneMask and VPInstruction::ExplicitVectorLength,
the assert in LoopVectorizationPlanner::computeBestVF fails for some
tests. This is because the VF chosen by the legacy cost model doesn't
match the VF chosen by the vplan cost model. See PR llvm#90191. This
assert is currently making it difficult to improve the cost model.

Hopefully we will be in a position to remove the assert soon; however,
in order to do that, we have to fix up a whole bunch of tests that rely
upon the legacy cost model output. I've tried my best to update
these tests to use vplan output instead.

There is still work needed for the VF=1 case because the vplan cost
model is not printed out in this case. I've not attempted to fix those
in this patch.
  • Loading branch information
david-arm authored Nov 19, 2024
1 parent 3093b29 commit 3097c60
Show file tree
Hide file tree
Showing 26 changed files with 379 additions and 277 deletions.
16 changes: 15 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
Expand Down Expand Up @@ -7424,7 +7425,20 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,

// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
#ifndef NDEBUG
unsigned EstimatedWidth = VF.getKnownMinValue();
if (VF.isScalable())
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
EstimatedWidth *= *VScale;
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
<< " (Estimated cost per lane: ");
if (Cost.isValid()) {
double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
} else /* No point dividing an invalid cost - it will still be invalid */
LLVM_DEBUG(dbgs() << "Invalid");
LLVM_DEBUG(dbgs() << ")\n");
#endif
return Cost;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
; %var4 a lower scalarization overhead.
;
; COST-LABEL: predicated_udiv_scalarized_operand
; COST: LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
;
;
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,14 @@
; CM: LV: Found uniform instruction: %a = extractvalue { i64, i64 } %sv, 0
; CM: LV: Found uniform instruction: %b = extractvalue { i64, i64 } %sv, 1

; Ensure the extractvalue + add instructions are hoisted out
; CM: vector.ph:
; CM: CLONE ir<%a> = extractvalue ir<%sv>
; CM: CLONE ir<%b> = extractvalue ir<%sv>
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
; CM: Successor(s): vector loop

; CM: LV: Scalar loop costs: 5.
; CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1

; Check that the extractvalue operands are actually free in vector code.

Expand Down Expand Up @@ -58,12 +63,14 @@ exit:
; Similar to the test case above, but checks getVectorCallCost as well.
declare float @powf(float, float) readnone nounwind

; CM: LV: Found uniform instruction: %a = extractvalue { float, float } %sv, 0
; CM: LV: Found uniform instruction: %b = extractvalue { float, float } %sv, 1
; Ensure the extractvalue + add instructions are hoisted out
; CM: vector.ph:
; CM: CLONE ir<%a> = extractvalue ir<%sv>
; CM: CLONE ir<%b> = extractvalue ir<%sv>
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
; CM: Successor(s): vector loop

; CM: LV: Scalar loop costs: 14.
; CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0
; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1

; FORCED-LABEL: define void @test_getVectorCallCost

Expand Down
14 changes: 7 additions & 7 deletions llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"

; CHECK-COST-LABEL: sadd
; CHECK-COST: Found an estimated cost of 6 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Cost of 4 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)

define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @saddsat(
Expand Down Expand Up @@ -129,10 +129,10 @@ while.end: ; preds = %while.body, %entry

; CHECK-COST-LABEL: umin
; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Cost of 1 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 16: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)

define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @umin(
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
target triple = "aarch64-unknown-linux-gnu"

; CHECK-COST: Checking a loop in 'fixed_width'
; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4
; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4
; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
; CHECK-COST: Selecting VF: 1.

; We should decide this loop is not worth vectorising using fixed width vectors
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ target triple = "aarch64"
; due to invalid cost decisions. The loop below has a low maximum trip count,
; so will be masked.

; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load
; COST: Cost of 3000000 for VF 2: REPLICATE ir<%0> = load
; COST: Cost of 3000000 for VF 4: REPLICATE ir<%0> = load
; COST: Cost of 3000000 for VF 8: REPLICATE ir<%0> = load
; COST: Cost of 3000000 for VF 16: REPLICATE ir<%0> = load
; COST: LV: Selecting VF: 1.

define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ target triple = "aarch64--linux-gnu"

; CHECK-LABEL: all_scalar
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
;
define void @all_scalar(ptr %a, i64 %n) {
Expand All @@ -27,7 +26,6 @@ for.end:

; CHECK-LABEL: PR33193
; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
%struct.a = type { i32, i8 }
define void @PR33193(ptr %a, i64 %n) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
;; registers required for a <vscale x 4 x fp128> when trying to maximize
;; vector bandwidth with SVE.

; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128
; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext ir<%load.in> to fp128

define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
; CHECK-LABEL: define void @load_ext_trunc_store(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,54 +1,59 @@
; REQUIRES: asserts
; RUN: opt -mtriple=aarch64 -mattr=+sve \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16

; GENERIC: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8)
; GENERIC: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4)
; GENERIC: LV: Selecting VF: vscale x 16

; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
; NEOVERSE-V1: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8)
; NEOVERSE-V1: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4)
; NEOVERSE-V1: LV: Selecting VF: vscale x 16

; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
; NEOVERSE-N2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5)
; NEOVERSE-N2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8)
; NEOVERSE-N2: LV: Selecting VF: vscale x 16

; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
; NEOVERSE-V2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5)
; NEOVERSE-V2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8)
; NEOVERSE-V2: LV: Selecting VF: 16

; VF-4: <4 x i32>
; VF-VSCALE4: <16 x i32>
; VF-16: <16 x i8>
; VF-VSCALE16: <vscale x 16 x i8>
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv
%0 = load i8, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
%1 = load i8, ptr %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
store i32 %add, ptr %arrayidx5, align 4
%add = add nsw i8 %0, %1
%arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv
store i8 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop

exit:
ret void
}

22 changes: 13 additions & 9 deletions llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ target triple = "arm64-apple-ios5.0.0"

define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
; CHECK: LV: Checking a loop in 'selects_1'
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6

; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>

; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>

; CHECK: LV: Selecting VF: 4

entry:
Expand Down Expand Up @@ -48,9 +50,11 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo

define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
; CHECK: LV: Checking a loop in 'multi_user_cmp'
; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %cmp1 = fcmp olt float %load1, 0.000000e+00
; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
; CHECK: Cost of 1 for VF 16:
; CHECK: any-of reduction %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
; CHECK: Cost of 1 for VF 16:
; CHECK: any-of reduction %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
; CHECK: Cost of 4 for VF 16: WIDEN ir<%cmp1> = fcmp olt ir<%load1>, ir<0.000000e+00>
; CHECK: LV: Selecting VF: 16.
entry:
br label %for.body
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

target triple = "aarch64-unknown-linux-gnu"

; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %addi7 = add i7 %indvars.iv1294, 0
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN ir<%addi7> = add ir<%indvars.iv1294>, ir<0>

define void @induction_i7(ptr %dst) #0 {
; CHECK-LABEL: define void @induction_i7(
Expand Down Expand Up @@ -71,9 +71,9 @@ for.end: ; preds = %for.body
}


; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %zexti3 = zext i3 %indvars.iv1294 to i64
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext ir<%indvars.iv1294> to i64

define void @induction_i3_zext(ptr %dst) #0 {
; CHECK-LABEL: define void @induction_i3_zext(
Expand Down
20 changes: 14 additions & 6 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@

target triple="aarch64-unknown-linux-gnu"

; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
; CHECK: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
; CHECK-LABEL: LV: Checking a loop in 'fadd_strict32'
; CHECK: Cost of 4 for VF vscale x 2:
; CHECK: in-loop reduction %add = fadd float %0, %sum.07
; CHECK: Cost of 8 for VF vscale x 4:
; CHECK: in-loop reduction %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Cost of 4 for VF vscale x 4:
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd float %0, %sum.07

define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
entry:
Expand All @@ -31,8 +36,11 @@ for.end:
}


; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
; CHECK-LABEL: LV: Checking a loop in 'fadd_strict64'
; CHECK: Cost of 4 for VF vscale x 2:
; CHECK: in-loop reduction %add = fadd double %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd double %0, %sum.07

define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
entry:
Expand Down
Loading

0 comments on commit 3097c60

Please sign in to comment.