From 5ca26d769deedc931ce19b4a68a68c799f8d7564 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 12 Dec 2024 10:00:24 +0000 Subject: [PATCH] [AArch64][SME2] Improve register allocation of multi-vector SME intrinsics (#116399) The FORM_TRANSPOSED_REG_TUPLE pseudos have been created to improve register allocation for intrinsics which use strided and contiguous multi-vector registers, avoiding unnecessary copies. If the operands of the pseudo are copies where the source register is in the StridedOrContiguous class, the pseudo is used by getRegAllocationHints to suggest a contiguous multi-vector register which matches the subregister sequence used by the operands. If the operands do not match this pattern, the pseudos are expanded to a REG_SEQUENCE. Patch contains changes by Matthew Devereau. --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 34 ++ .../Target/AArch64/AArch64ISelLowering.cpp | 71 +++ .../Target/AArch64/AArch64RegisterInfo.cpp | 52 ++ llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 5 + llvm/lib/Target/AArch64/SMEInstrFormats.td | 28 +- .../AArch64/sme2-intrinsics-int-dots.ll | 492 +++++++++++++----- .../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 284 ++++++++-- 7 files changed, 805 insertions(+), 161 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 055cb3cefcedf9..0ce4b8971625c1 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -67,6 +67,10 @@ class AArch64ExpandPseudo : public MachineFunctionPass { TargetRegisterClass ContiguousClass, TargetRegisterClass StridedClass, unsigned ContiguousOpc, unsigned StridedOpc); + bool expandFormTuplePseudo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned Size); bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); @@ -1142,6 +1146,32 @@ bool
AArch64ExpandPseudo::expandMultiVecPseudo( return true; } +bool AArch64ExpandPseudo::expandFormTuplePseudo( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, unsigned Size) { + assert((Size == 2 || Size == 4) && "Invalid Tuple Size"); + MachineInstr &MI = *MBBI; + Register ReturnTuple = MI.getOperand(0).getReg(); + + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + for (unsigned I = 0; I < Size; ++I) { + Register FormTupleOpReg = MI.getOperand(I + 1).getReg(); + Register ReturnTupleSubReg = + TRI->getSubReg(ReturnTuple, AArch64::zsub0 + I); + // Add copies to ensure the subregisters remain in the correct order + // for any contiguous operation they are used by. + if (FormTupleOpReg != ReturnTupleSubReg) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORR_ZZZ)) + .addReg(ReturnTupleSubReg, RegState::Define) + .addReg(FormTupleOpReg) + .addReg(FormTupleOpReg); + } + + MI.eraseFromParent(); + return true; +} + /// If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, @@ -1724,6 +1754,10 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandMultiVecPseudo( MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED); + case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO: + return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2); + case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO: + return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4); } return false; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cee609ed1e2f6f..41e0214dab6c73 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8581,6 +8581,56 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { return ZExtBool; } +// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the +// input operands are copy nodes where the source register is in a +// StridedOrContiguous class. For example: +// +// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO .. +// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous +// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous +// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO .. 
+// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous +// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous +// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr +// +bool shouldUseFormStridedPseudo(MachineInstr &MI) { + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + + const TargetRegisterClass *RegClass = nullptr; + switch (MI.getOpcode()) { + case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO: + RegClass = &AArch64::ZPR2StridedOrContiguousRegClass; + break; + case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO: + RegClass = &AArch64::ZPR4StridedOrContiguousRegClass; + break; + default: + llvm_unreachable("Unexpected opcode."); + } + + unsigned SubReg = AArch64::NoSubRegister; + for (unsigned I = 1; I < MI.getNumOperands(); ++I) { + MachineOperand &MO = MI.getOperand(I); + assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE"); + + MachineOperand *Def = MRI.getOneDef(MO.getReg()); + if (!Def || !Def->getParent()->isCopy()) + return false; + + const MachineOperand &CopySrc = Def->getParent()->getOperand(1); + unsigned OpSubReg = CopySrc.getSubReg(); + if (SubReg == AArch64::NoSubRegister) + SubReg = OpSubReg; + + MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg()); + if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg || + MRI.getRegClass(CopySrcOp->getReg()) != RegClass) + return false; + } + + return true; +} + void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { // Live-in physreg copies that are glued to SMSTART are applied as @@ -8606,6 +8656,27 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, } } + if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || + MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) { + // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies + // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
+ if (shouldUseFormStridedPseudo(MI)) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(TargetOpcode::REG_SEQUENCE), + MI.getOperand(0).getReg()); + + for (unsigned I = 1; I < MI.getNumOperands(); ++I) { + MIB.add(MI.getOperand(I)); + MIB.addImm(AArch64::zsub0 + (I - 1)); + } + + MI.eraseFromParent(); + return; + } + // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that // have nothing to do with VG, were it not that they are used to materialise a // frame-address. If they contain a frame-index to a scalable vector, this diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 133cc1344b98ff..85a7663993a046 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1081,6 +1081,58 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } +// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation +// where a consecutive multi-vector tuple is constructed from the same indices +// of multiple strided loads. This may still result in unnecessary copies +// between the loads and the tuple. Here we try to return a hint to assign the +// contiguous ZPRMulReg starting at the same register as the first operand of +// the pseudo, which should be a subregister of the first strided load. +// +// For example, if the first strided load has been assigned $z16_z20_z24_z28 +// and the operands of the pseudo are each accessing subregister zsub2, we +// should look through Order to find a contiguous register which +// begins with $z24 (i.e. $z24_z25_z26_z27).
+// +bool AArch64RegisterInfo::getRegAllocationHints( + Register VirtReg, ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, + const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { + if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO && + MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, + MF, VRM); + + unsigned FirstOpSubReg = MI.getOperand(1).getSubReg(); + switch (FirstOpSubReg) { + case AArch64::zsub0: + case AArch64::zsub1: + case AArch64::zsub2: + case AArch64::zsub3: + break; + default: + continue; + } + + // Look up the physical register mapped to the first operand of the pseudo. + Register FirstOpVirtReg = MI.getOperand(1).getReg(); + if (!VRM->hasPhys(FirstOpVirtReg)) + continue; + + MCRegister TupleStartReg = + getSubReg(VRM->getPhys(FirstOpVirtReg), FirstOpSubReg); + for (unsigned I = 0; I < Order.size(); ++I) + if (MCRegister R = getSubReg(Order[I], AArch64::zsub0)) + if (R == TupleStartReg) + Hints.push_back(Order[I]); + } + + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, + VRM); +} + unsigned AArch64RegisterInfo::getLocalAddressRegister( const MachineFunction &MF) const { const auto &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 5c8a5e029584fc..11da624af4881b 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -134,6 +134,11 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; + bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, + const MachineFunction &MF, const VirtRegMap *VRM, + const
LiveRegMatrix *Matrix) const override; + unsigned getLocalAddressRegister(const MachineFunction &MF) const; bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index a67093b1a58c3d..b62ffcbebc652a 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -35,6 +35,30 @@ def tileslicerange0s4 : ComplexPattern", []>; let WantsRoot = true in def am_sme_indexed_b4 : ComplexPattern">; +// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to +// improve register allocation for intrinsics which use strided and contiguous +// multi-vector registers, avoiding unnecessary copies. +// If the operands of the pseudo are copies where the source register is in +// the StridedOrContiguous class, the pseudo is used to provide a hint to the +// register allocator suggesting a contiguous multi-vector register which +// matches the subregister sequence used by the operands. +// If the operands do not match this pattern, the pseudos are expanded +// to a REG_SEQUENCE using the post-isel hook.
+ +def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO : + Pseudo<(outs ZPR2Mul2:$tup), + (ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{ + let hasSideEffects = 0; + let hasPostISelHook = 1; +} + +def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO : + Pseudo<(outs ZPR4Mul4:$tup), + (ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{ + let hasSideEffects = 0; + let hasPostISelHook = 1; +} + def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore, [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; @@ -173,14 +197,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)), (!cast(name # _PSEUDO) $base, $offset, - (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>; + (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>; class SME2_ZA_TwoOp_VG4_Multi_Index_Pat : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)), (!cast(name # _PSEUDO) $base, $offset, - (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3), + (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), zpr_ty:$Zm, imm_ty:$i)>; class SME2_Sat_Shift_VG2_Pat diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 1e835c92ba9e4c..ef569e480ea3d6 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s target triple="aarch64-linux-gnu" @@ 
-26,18 +26,18 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } -; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } +; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -68,18 +68,18 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } -; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - 
z7.b }, { z24.b - z27.b } +; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -110,18 +110,18 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } -; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } +; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #1 { call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -152,18 +152,18 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, %unused, < define void @usdot_multi_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: usdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] -; CHECK-NEXT: mov 
z28.d, z1.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } -; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b } +; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -197,18 +197,18 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } -; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } +; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -239,18 +239,18 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov 
z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } -; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b } +; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -281,18 +281,18 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } -; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } +; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #1 { call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -309,9 +309,7 @@ define void @sdot_multi_za64_u16_vg1x4(i32 
%slice, %unused, < define void @udot_single_za32_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: udot_single_za32_u16_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: ret @@ -324,11 +322,7 @@ define void @udot_single_za32_u16_vg1x2(i32 %slice, %unused, define void @udot_single_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: udot_single_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: ret @@ -341,9 +335,7 @@ define void @udot_single_za32_u16_vg1x4(i32 %slice, %unused, define void @udot_single_za32_u8_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: udot_single_za32_u8_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: ret @@ -356,11 +348,7 @@ define void @udot_single_za32_u8_vg1x2(i32 %slice, %unused, < define void @udot_single_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: udot_single_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed 
$z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: ret @@ -373,9 +361,7 @@ define void @udot_single_za32_u8_vg1x4(i32 %slice, %unused, < define void @udot_single_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: udot_single_za64_u16_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: ret @@ -388,11 +374,7 @@ define void @udot_single_za64_u16_vg1x2(i32 %slice, %unused, define void @udot_single_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #1 { ; CHECK-LABEL: udot_single_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: ret @@ -405,9 +387,7 @@ define void @udot_single_za64_u16_vg1x4(i32 %slice, %unused, define void @usdot_single_za32_u8_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: usdot_single_za32_u8_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, 
w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: ret @@ -420,11 +400,7 @@ define void @usdot_single_za32_u8_vg1x2(i32 %slice, %unused, define void @usdot_single_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: usdot_single_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: ret @@ -440,9 +416,7 @@ define void @usdot_single_za32_u8_vg1x4(i32 %slice, %unused, define void @sdot_single_za32_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: sdot_single_za32_u16_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: ret @@ -455,11 +429,7 @@ define void @sdot_single_za32_u16_vg1x2(i32 %slice, %unused, define void @sdot_single_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: sdot_single_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 
killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: ret @@ -472,9 +442,7 @@ define void @sdot_single_za32_u16_vg1x4(i32 %slice, %unused, define void @sdot_single_za32_u8_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: sdot_single_za32_u8_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: ret @@ -487,11 +455,7 @@ define void @sdot_single_za32_u8_vg1x2(i32 %slice, %unused, < define void @sdot_single_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: sdot_single_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: ret @@ -504,9 +468,7 @@ define void @sdot_single_za32_u8_vg1x4(i32 %slice, %unused, < define void @sdot_single_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: sdot_single_za64_u16_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: ret @@ -519,11 +481,7 @@ define void 
@sdot_single_za64_u16_vg1x2(i32 %slice, %unused, define void @sdot_single_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #1 { ; CHECK-LABEL: sdot_single_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: ret @@ -536,9 +494,7 @@ define void @sdot_single_za64_u16_vg1x4(i32 %slice, %unused, define void @sudot_single_za32_u8_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: sudot_single_za32_u8_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: ret @@ -551,11 +507,7 @@ define void @sudot_single_za32_u8_vg1x2(i32 %slice, %unused, define void @sudot_single_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: sudot_single_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: ret @@ -571,8 +523,8 @@ define void 
@udot_lane_za32_u16_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: udot_lane_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3] ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3] ; CHECK-NEXT: ret @@ -605,8 +553,8 @@ define void @udot_lane_za32_u8_vg1x2(i32 %slice, %unused, %unused, %unused, , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: udot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { 
z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + define void @udot_lane_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: 
udot_lane_za64_u16_vg1x2: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1] ; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1] ; CHECK-NEXT: ret @@ -654,8 +676,8 @@ define void @udot_lane_za64_u16_vg1x4(i32 %slice, %unused, %unused, %unused, %unused, , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: usdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } 
%1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} ; == Multi, indexed (signed) == @@ -710,8 +805,8 @@ define void @sdot_lane_za32_u16_vg1x2(i32 %slice, %unused, %unused, %unused, %unused, %unused, , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } 
@llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: sdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = 
extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + define void @sdot_lane_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: sdot_lane_za64_u16_vg1x2: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1] ; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1] ; CHECK-NEXT: ret @@ -793,8 +962,8 @@ define void @sdot_lane_za64_u16_vg1x4(i32 %slice, %unused, %unused, %unused, %unused, , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret 
void +} + +define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: sudot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr 
%arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + attributes #0 = { nounwind "target-features"="+sme2" } attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" } - ; == Multi, multi (unsigned) declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, , , , ) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll index a0d8c18f55c3a0..49106e12378bea 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll @@ -1,15 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s - +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s ; == FVDOT == define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, %zn1, %zn2, %zm) { ; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: ret @@ -25,9 +22,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 
%slice, define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, %zn1, %zn2, %zm) { ; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: ret @@ -43,9 +38,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, %zn1, %zn2, %zm) { ; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: ret @@ -58,11 +51,7 @@ define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, % define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_svdot_lane_za32_vg1x4_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: svdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: ret @@ -75,11 +64,7 @@ define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, % define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_svdot_lane_za64_vg1x4_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; 
CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: svdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1] ; CHECK-NEXT: svdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1] ; CHECK-NEXT: ret @@ -89,15 +74,87 @@ define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, % ret void } +define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: svdot_form_2x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: add x9, x0, x1 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9] +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: svdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; 
CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, 
undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} ; == UVDOT == define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, %zn1, %zn2, %zm) { ; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: ret @@ -110,11 +167,7 @@ define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, % define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_uvdot_lane_za32_vg1x4_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: uvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: ret @@ -127,11 +180,7 @@ define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, % define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_uvdot_lane_za64_vg1x4_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed 
$z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: uvdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1] ; CHECK-NEXT: uvdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1] ; CHECK-NEXT: ret @@ -141,17 +190,87 @@ define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, % ret void } +define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: uvdot_form_2x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: add x9, x0, x1 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: uvdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; 
CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} 
; == SUVDOT == define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_suvdot_lane_za32_vg1x4_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: suvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: ret @@ -161,17 +280,62 @@ define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, ret void } +define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: suvdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } 
@llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} ; == USVDOT == define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_usvdot_lane_za32_vg1x4_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: usvdot za.s[w8, 7, vgx4], { 
z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: ret @@ -181,6 +345,58 @@ define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, ret void } +define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: usvdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr 
inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + +attributes #0 = { nounwind "target-features"="+sme2" } +attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" } ; == FVDOT == declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32, , , , i32)