From 303c8d20601d810c177f6646f771c1eb3f29ab8c Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Fri, 11 Oct 2024 12:29:44 +0100 Subject: [PATCH] [AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. (#111538) Introduce a description of late forwarding to the Neoverse-V1 Scheduling model. --- .../Target/AArch64/AArch64SchedNeoverseV1.td | 207 ++- .../llvm-mca/AArch64/Neoverse/V1-forwarding.s | 1421 +++++++++++++++++ .../AArch64/Neoverse/V1-neon-instructions.s | 138 +- 3 files changed, 1645 insertions(+), 121 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index f7e6545f0dd386..fb4d2f3d7bcd3a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -469,6 +469,89 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01, V1UnitV, V1UnitV, V1UnitV, V1UnitV, V1UnitV, V1UnitV]>; +//===----------------------------------------------------------------------===// +// Define forwarded types + +// NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for +// consumers of 64 bit multiply high operations? +def V1Wr_IM : SchedWriteRes<[V1UnitM]> { let Latency = 2; } +def V1Wr_IMA : SchedWriteRes<[V1UnitM0]> { let Latency = 2; } +def V1WriteIM : SchedWriteVariant< + [SchedVar, + SchedVar]>; +def V1Rd_IMA : SchedReadAdvance<1, [V1Wr_IMA]>; + +def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>; + +def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; } +def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>; + +def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>; + +def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>; + +def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; } +def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>; + +def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; } +def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>; + +def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; } +def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>; + +def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>; + +def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>; + +def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; } +def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>; + +def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>; + +def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; } +def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>; + +def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>; + +def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; } +def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>; + +def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let Latency = 3; } +def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>; + +def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>; + +def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; } +def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>; + +def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; } +def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>; + +let Latency = 5, NumMicroOps = 2 in +def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>; +def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>; + +def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>; + +def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; } +def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>; + +def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; } +def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>; +def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>; +def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>; // Miscellaneous Instructions // ----------------------------------------------------------------------------- @@ -553,16 +636,19 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>; def : SchedAlias; def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + // Multiply -// Multiply accumulate -// Multiply accumulate, long -// Multiply long -def V1WriteIM : SchedWriteVariant< - [SchedVar, - SchedVar]>; -def : SchedAlias; -def : SchedAlias; +// Multiply accumulate, W-form +// Multiply accumulate, X-form +def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], + (instregex "^M(ADD|SUB)[WX]rrr$")>; +// Multiply accumulate long +// Multiply long +def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], + (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; // Multiply high def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>; @@ -680,10 +766,11 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>; def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>; // FP multiply -def : SchedAlias; +def : WriteRes { let Latency = 3; } // FP multiply accumulate -def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; +def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], + (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; // FP round to integral def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$", @@ -824,7 +911,7 @@ def : SchedAlias; // ASIMD absolute diff accum // ASIMD absolute diff accum long // ASIMD pairwise add and accumulate long -def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>; +def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>; // ASIMD arith, reduce, 4H/4S // ASIMD max/min, reduce, 4H/4S @@ -843,23 +930,26 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$", // ASIMD dot product // ASIMD dot product using signed and unsigned integers -def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>; +def : InstRW<[V1Wr_VDOT, V1Rd_VDOT], + (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>; -// ASIMD matrix multiply- accumulate -def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>; +// ASIMD matrix multiply-accumulate +def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>; // ASIMD multiply +def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>; + // ASIMD multiply accumulate +def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>; + // ASIMD multiply accumulate long +def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; + // ASIMD multiply accumulate high +def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; + // ASIMD multiply accumulate saturating long -def : InstRW<[V1Write_4c_1V02], - (instregex "^MUL(v[148]i16|v[124]i32)$", - "^SQR?DMULH(v[48]i16|v[24]i32)$", - "^ML[AS](v[148]i16|v[124]i32)$", - "^[SU]ML[AS]Lv", - "^SQRDML[AS]H(v[148]i16|v[124]i32)$", - "^SQDML[AS]Lv")>; +def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>; // ASIMD multiply/multiply long (8x8) polynomial def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>; @@ -868,11 +958,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>; def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>; // ASIMD shift accumulate +def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>; + // ASIMD shift by immed, complex // ASIMD shift by register, complex def : InstRW<[V1Write_4c_1V13], - (instregex "^[SU]R?SRAv", - "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$", + (instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$", "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$", "^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv", "^[SU]Q?RSHLv", "^[SU]QSHLv")>; @@ -890,16 +981,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv", // ASIMD FP absolute value/difference // ASIMD FP arith, normal // ASIMD FP compare -// ASIMD FP complex add // ASIMD FP max/min, normal // ASIMD FP max/min, pairwise // ASIMD FP negate // Covered by "SchedAlias (WriteV[dq]...)" above +// ASIMD FP complex add +def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>; + // ASIMD FP complex multiply add +def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>; + +// ASIMD FP multiply +def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>; + // ASIMD FP multiply accumulate -def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$", - "^FML[AS]v")>; +def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>; + +// ASIMD FP multiply accumulate long +def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>; // ASIMD FP convert, long (F16 to F32) def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>; @@ -953,12 +1053,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>; // ASIMD FP max/min, reduce, Q-form F16 def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>; -// ASIMD FP multiply -def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>; - -// ASIMD FP multiply accumulate long -def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>; - // ASIMD FP round, D-form F32 and Q-form F64 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>; @@ -976,13 +1070,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>; def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>; // ASIMD dot product -def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>; +def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>; // ASIMD matrix multiply accumulate -def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>; +def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>; // ASIMD multiply accumulate long -def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>; +def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>; // Scalar convert, F32 to BF16 def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>; @@ -1300,7 +1394,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>; // ----------------------------------------------------------------------------- // CRC checksum ops -def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>; +def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>; // SVE Predicate instructions @@ -1440,13 +1534,14 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>; +def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>; // Dot product, 8 bit, using signed and unsigned integers -def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; +def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], + (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; // Dot product, 16 bit -def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>; +def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>; // Duplicate, immediate and indexed form def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$", @@ -1488,7 +1583,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$", "^MOVPRFX_ZZ$")>; // Matrix multiply-accumulate -def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; // Multiply, B, H, S element size def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", @@ -1497,12 +1592,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", "^[SU]MULH_ZPZZ_[BHS]")>; // Multiply, D element size -// Multiply accumulate, D element size def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D", "^MUL_ZPZZ_D", "^[SU]MULH_(ZPmZ|ZZZ)_D", - "^[SU]MULH_ZPZZ_D", - "^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>; + "^[SU]MULH_ZPZZ_D")>; + +// Multiply accumulate, D element size +def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD], + (instregex "^ML[AS]_ZPZZZ_D")>; +def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD], + (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; // Multiply accumulate, B, H, S element size // NOTE: This is not specified in the SOG. @@ -1583,8 +1682,8 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$", def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>; // Floating point complex multiply add -def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$", - "^FCMLA_ZZZI_[HS]$")>; +def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>; +def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>; // Floating point convert, long or narrow (F16 to F32 or F32 to F16) // Floating point convert to integer, F32 @@ -1623,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]", "^FMUL_ZPZ[IZ]_[HSD]")>; // Floating point multiply accumulate +def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA], + (instregex "^FN?ML[AS]_ZPmZZ_[HSD]", + "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>; +def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA], + (instregex "^FML[AS]_ZZZI_[HSD]", + "^FN?ML[AS]_ZPZZZ_[HSD]")>; + // Floating point reciprocal step -def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$", - "^FN?ML[AS]_ZPZZZ_[HSD]", - "^FML[AS]_ZZZI_[HSD]$", - "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>; +def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; // Floating point reciprocal estimate, F16 def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>; @@ -1681,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$", def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; // Dot product -def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; +def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>; // Multiply accumulate long -def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; +def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; // SVE Load instructions diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s new file mode 100644 index 00000000000000..4de37f96000520 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s @@ -0,0 +1,1421 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s + +# LLVM-MCA-BEGIN madd +mul x0, x0, x0 +madd x0, x1, x2, x0 +madd x0, x1, x2, x0 +madd x0, x0, x0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smaddl +mul x0, x0, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w0, w0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmadd +fadd d0, d0, d0 +fmadd d0, d1, d2, d0 +fmul d0, d0, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d0, d1, d2 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN saba +mul v0.4s, v0.4s, v0.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sadalp +mul v0.4s, v0.4s, v0.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v0.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sdot +mul v0.4s, v0.4s, v0.4s +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smmla +mul v0.4s, v0.4s, v0.4s +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN mla +mul v0.4s, v0.4s, v0.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smlal2 +mul v0.4s, v0.4s, v0.4s +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN ssra +mul v0.4s, v0.4s, v0.4s +ssra v0.2d, v1.2d, #1 +ssra v0.2d, v1.2d, #1 +ssra v0.2d, v0.2d, #1 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fcmla +fmul v0.4s, v0.4s, v0.4s +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v0.2d, v1.2d, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmla +fmul v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fadd v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v0.2d, v1.2d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmlal +fmul v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fadd v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v0.4h, v1.4h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfdot +fmul v0.2d, v0.2d, v0.2d +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmmla +fmul v0.2d, v0.2d, v0.2d +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul v0.2d, v0.2d, v0.2d +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN crc32cb +mul w0, w0, w0 +crc32cb w0, w0, w1 +crc32cb w0, w0, w1 +crc32cb w0, w0, w0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.s +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b +sdot z0.s, z1.b, z2.b +sdot z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sudot +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z0.b, z1.b[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.d +mul z0.d, p0/m, z0.d, z0.d +sdot z0.d, z1.h, z2.h +sdot z0.d, z1.h, z2.h +sdot z0.d, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z smmla +mul z0.d, p0/m, z0.d, z0.d +smmla z0.s, z1.b, z2.b +smmla z0.s, z1.b, z2.b +smmla z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mla.d +mul z0.d, p0/m, z0.d, z0.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mad.d +mul z0.d, p0/m, z0.d, z0.d +mad z0.d, p0/m, z1.d, z2.d +mad z0.d, p0/m, z1.d, z2.d +mad z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z msb.d +mul z0.d, p0/m, z0.d, z0.d +msb z0.d, p0/m, z1.d, z2.d +msb z0.d, p0/m, z1.d, z2.d +msb z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZPmZZ +fmul z0.d, z0.d, z0.d +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z0.d, z1.d, 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZZZI +fmul z0.d, z0.d, z0.d +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z0.s, z1.s[1], 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZPmZZ +fmul z0.d, z0.d, z0.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZZZI +fmul z0.d, z0.d, z0.d +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z0.d, z1.d[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfdot +fmul z0.d, z0.d, z0.d +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfmmla +fmul z0.d, z0.d, z0.d +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul z0.d, z0.d, z0.d +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z0.h, z1.h +# LLVM-MCA-END + +# CHECK: [0] Code Region - madd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0 +# CHECK-NEXT: [1,2] D==========eeER.. madd x0, x1, x2, x0 +# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [1] Code Region - smaddl + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [2] Code Region - fmadd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeER. . . . . . .. fadd d0, d0, d0 +# CHECK-NEXT: [0,1] D==eeeeER . . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,2] D======eeeER . . . . .. fmul d0, d0, d0 +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmadd d0, d0, d1, d2 +# CHECK-NEXT: [1,0] D=================eeER . . .. fadd d0, d0, d0 +# CHECK-NEXT: [1,1] D===================eeeeER . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,2] D=======================eeeER . .. fmul d0, d0, d0 +# CHECK-NEXT: [1,3] D========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,4] D==========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,5] D==============================eeeeER fmadd d0, d0, d1, d2 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0 +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmul d0, d0, d0 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmadd d0, d0, d1, d2 +# CHECK-NEXT: 2 15.7 0.1 0.0 + +# CHECK: [3] Code Region - saba + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] D======================eeeeER saba v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [4] Code Region - sadalp + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,3] D======================eeeeER sadalp v0.2d, v0.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 sadalp v0.2d, v0.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [5] Code Region - sdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 0.8 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D===============eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] D================eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] D===================eeeER sdot v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.8 0.1 0.0 + +# CHECK: [6] Code Region - smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 0.8 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D===============eeeER . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] D================eeeER . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] D===================eeeER smmla v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.8 0.1 0.0 + +# CHECK: [7] Code Region - mla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] D======================eeeeER mla v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [8] Code Region - smlal2 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER smlal2 v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [9] Code Region - ssra + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [0,3] D=========eeeeER . . . ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,2] D==================eeeeER. . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,3] D======================eeeeER ssra v0.2d, v0.2d, #1 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [10] Code Region - fcmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,2] D==================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,3] D======================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [11] Code Region - fmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D=eeeeER . . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,2] D=====eeER. . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: [1,0] D=================eeeER . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D==================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,2] D======================eeER . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,4] D==========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,5] D==============================eeeeER fmla v0.2d, v0.2d, v1.2d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 2. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: 2 15.3 0.1 0.0 + +# CHECK: [12] Code Region - fmlal + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 2203 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456 + +# CHECK: [0,0] DeeeER . . . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,2] D========eeER . . . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,4] D============eeeeeER. . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,5] D=================eeeeeER. . . . .. fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: [1,0] D======================eeeER . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D=========================eeeeeER . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,2] D==============================eeER. . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] D================================eeeeeER. .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,4] D==================================eeeeeER .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,5] D=======================================eeeeeER fmlal v0.4s, v0.4h, v1.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 12.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 15.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 2. 2 20.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 22.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 5. 2 29.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: 2 20.3 0.1 0.0 + +# CHECK: [13] Code Region - bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D================eeeeER . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER bfdot v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [14] Code Region - bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D===================eeeeeER . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D======================eeeeeER. . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D===========================eeeeeER bfmmla v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 20.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 14.0 0.1 0.0 + +# CHECK: [15] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D================eeeeER . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER bfmlalb v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [16] Code Region - crc32cb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul w0, w0, w0 +# CHECK-NEXT: [0,1] D==eeER . .. crc32cb w0, w0, w1 +# CHECK-NEXT: [0,2] D===eeER . .. crc32cb w0, w0, w1 +# CHECK-NEXT: [0,3] D=====eeER. .. crc32cb w0, w0, w0 +# CHECK-NEXT: [1,0] D=======eeER .. mul w0, w0, w0 +# CHECK-NEXT: [1,1] D=========eeER .. crc32cb w0, w0, w1 +# CHECK-NEXT: [1,2] D==========eeER.. crc32cb w0, w0, w1 +# CHECK-NEXT: [1,3] D============eeER crc32cb w0, w0, w0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32cb w0, w0, w0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [17] Code Region - Z sdot.s + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [18] Code Region - Z sudot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [19] Code Region - Z sdot.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h +# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,2] D====================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h +# CHECK-NEXT: 2 13.3 0.1 0.0 + +# CHECK: [20] Code Region - Z smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. smmla z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] D==================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] D=====================eeeER smmla z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smmla z0.s, z0.b, z1.b +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [21] Code Region - Z mla.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER mla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 2 21.0 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [22] Code Region - Z mad.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. mad z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER mad z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 2 21.0 0.0 0.0 mad z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [23] Code Region - Z msb.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. msb z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER msb z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 2 21.0 0.0 0.0 msb z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [24] Code Region - Z fcmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: 2 13.0 0.1 0.0 + +# CHECK: [25] Code Region - Z fcmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: 2 13.0 0.1 0.0 + +# CHECK: [26] Code Region - Z fmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [27] Code Region - Z fmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, z0.d, z1.d[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [28] Code Region - Z bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D======================eeeeER bfdot z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [29] Code Region - Z bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D===================eeeeeER . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D======================eeeeeER. . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D===========================eeeeeER bfmmla z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 2 20.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: 2 14.0 0.1 0.0 + +# CHECK: [30] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 2 18.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: 2 13.0 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s index 1e8df4770d7950..65b73177c7b70a 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s @@ -1365,8 +1365,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 fcmgt s10, s11, s12 # CHECK-NEXT: 1 2 0.25 fcmgt v0.4s, v0.4s, #0.0 # CHECK-NEXT: 1 2 0.25 fcmgt v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 2 0.25 fcmla v0.2s, v0.2s, v0.2s, #90 -# CHECK-NEXT: 1 2 0.25 fcmla v0.4s, v0.4s, v0.s[1], #0 +# CHECK-NEXT: 1 4 0.25 fcmla v0.2s, v0.2s, v0.2s, #90 +# CHECK-NEXT: 1 4 0.25 fcmla v0.4s, v0.4s, v0.s[1], #0 # CHECK-NEXT: 1 2 0.25 fcmle d20, d21, #0.0 # CHECK-NEXT: 1 2 0.25 fcmle s10, s11, #0.0 # CHECK-NEXT: 1 2 0.25 fcmle v0.2d, v0.2d, #0.0 @@ -1651,7 +1651,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 7 8 1.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] # CHECK-NEXT: 8 8 1.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 # CHECK-NEXT: 8 8 1.00 * ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 -# CHECK-NEXT: 1 2 0.25 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mla v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 mls v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 2 0.25 mov b0, v0.b[15] # CHECK-NEXT: 1 2 0.25 mov d6, v0.d[1] @@ -1673,7 +1673,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 movi v0.2s, #8, msl #8 # CHECK-NEXT: 1 2 0.25 movi v0.4s, #255, lsl #24 # CHECK-NEXT: 1 2 0.25 movi v0.8b, #255 -# CHECK-NEXT: 1 2 0.25 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mul v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 mvni v0.2s, #0 # CHECK-NEXT: 1 2 0.25 mvni v0.4s, #16, msl #16 # CHECK-NEXT: 1 2 0.25 neg d29, d24 @@ -1780,10 +1780,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 4 1.00 scvtf v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 scvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 4 6 1.00 scvtf v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sdot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 sdot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 sdot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 sdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 sdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 sdot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 shadd v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 shl d7, d10, #12 # CHECK-NEXT: 1 2 0.50 shl v0.16b, v0.16b, #3 @@ -1873,26 +1873,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 sqadd b20, b11, b15 # CHECK-NEXT: 1 2 0.25 sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.25 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 2 0.25 sqdmlal d19, s24, s12 +# CHECK-NEXT: 1 4 0.50 sqdmlal d19, s24, s12 # CHECK-NEXT: 1 4 0.50 sqdmlal d8, s9, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmlal s17, h27, h12 +# CHECK-NEXT: 1 4 0.50 sqdmlal s17, h27, h12 # CHECK-NEXT: 1 4 0.50 sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqdmlsl d12, s23, s13 +# CHECK-NEXT: 1 4 0.50 sqdmlsl d12, s23, s13 # CHECK-NEXT: 1 4 0.50 sqdmlsl d8, s9, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmlsl s14, h12, h25 +# CHECK-NEXT: 1 4 0.50 sqdmlsl s14, h12, h25 # CHECK-NEXT: 1 4 0.50 sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqdmulh h10, h11, h12 -# CHECK-NEXT: 1 2 0.25 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmulh s15, s14, v0.s[1] -# CHECK-NEXT: 1 2 0.25 sqdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmulh v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 sqdmull d1, s1, v0.s[1] @@ -1914,34 +1914,34 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 sqneg v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 sqneg v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 sqneg v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqrdmlah h0, h1, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah s0, s1, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, h2 # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.4h # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.8h # CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, s2 # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.2s # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 1 2 0.25 sqrdmlsh h0, h1, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh s0, s1, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: 1 4 0.50 sqrdmlsh h0, h1, h2 # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.4h # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.8h # CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, s2 # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.2s # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 1 2 0.25 sqrdmulh h10, h11, h12 -# CHECK-NEXT: 1 2 0.25 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmulh s15, s14, v0.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sqrshl d31, d31, d31 @@ -2124,8 +2124,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 5 4 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], x5 # CHECK-NEXT: 1 2 0.25 sub d15, d5, d16 # CHECK-NEXT: 1 2 0.25 sub v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 2 0.25 sudot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 sudot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sudot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sudot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 suqadd b19, b14 # CHECK-NEXT: 1 2 0.25 suqadd d18, d22 # CHECK-NEXT: 1 2 0.25 suqadd h20, h15 @@ -2222,10 +2222,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 4 1.00 ucvtf v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 ucvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 4 6 1.00 ucvtf v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 udot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 udot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 udot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 udot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 udot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 udot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 udot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 udot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 uhadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.25 uhadd v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 uhsub v0.4s, v0.4s, v0.4s @@ -2356,10 +2356,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 ursra v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 ursra v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 0.50 ursra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 2 0.25 usdot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 usdot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 usdot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 usdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 usdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 usdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 usdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 usdot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.50 ushl d0, d0, d0 # CHECK-NEXT: 1 2 0.50 ushl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.50 ushl v0.4s, v0.4s, v0.4s @@ -2465,7 +2465,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] -# CHECK-NEXT: - - - - 26.67 49.17 49.17 18.75 7.75 7.75 7.75 394.50 377.00 349.00 331.50 +# CHECK-NEXT: - - - - 26.67 49.17 49.17 18.75 7.75 7.75 7.75 401.00 370.50 355.50 325.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions: @@ -2892,7 +2892,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 1.00 - - - - 1.00 1.00 1.00 1.00 ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] # CHECK-NEXT: - - - - 1.00 1.00 1.00 0.25 0.25 0.25 0.25 1.00 1.00 1.00 1.00 ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 # CHECK-NEXT: - - - - 1.00 1.00 1.00 0.25 0.25 0.25 0.25 1.00 1.00 1.00 1.00 ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mla v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mls v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mov b0, v0.b[15] # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mov d6, v0.d[1] @@ -2914,7 +2914,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.2s, #8, msl #8 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.4s, #255, lsl #24 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.8b, #255 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mul v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mvni v0.2s, #0 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mvni v0.4s, #16, msl #16 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 neg d29, d24 @@ -3114,26 +3114,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd b20, b11, b15 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlal d19, s24, s12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal d19, s24, s12 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal d8, s9, v0.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlal s17, h27, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal s17, h27, h12 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlsl d12, s23, s13 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl d12, s23, s13 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl d8, s9, v0.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlsl s14, h12, h25 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl s14, h12, h25 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh h10, h11, h12 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh s15, s14, v0.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh s20, s21, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh v0.4s, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmull d1, s1, v0.s[1] @@ -3155,34 +3155,34 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah h0, h1, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah s0, s1, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah h0, h1, h2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4h, v1.4h, v2.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.8h, v1.8h, v2.8h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah s0, s1, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.2s, v1.2s, v2.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh h0, h1, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh s0, s1, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh h0, h1, h2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4h, v1.4h, v2.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.8h, v1.8h, v2.8h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh s0, s1, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.2s, v1.2s, v2.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4s, v1.4s, v2.4s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh h10, h11, h12 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh s15, s14, v0.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh s20, s21, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 sqrshl d31, d31, d31