Skip to content

Commit

Permalink
Allow using the whole register file on gfx90a for VGPRs
Browse files Browse the repository at this point in the history
In a kernel that makes no calls and does not use AGPRs, we can allocate
the whole vector register budget to VGPRs and reserve no AGPRs, as
long as the VGPRs stay addressable (i.e. their number stays below 256).

Patch by: Stanislav Mekhanoshin

Change-Id: I2ea6eea58a449cf12368a37af18a892220c6e23b
  • Loading branch information
kerbowa authored and zhang2amd committed Oct 26, 2021
1 parent 64dd121 commit 9bbd96f
Show file tree
Hide file tree
Showing 5 changed files with 769 additions and 9 deletions.
34 changes: 34 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -611,3 +611,37 @@ bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
}
return false;
}

/// Determine whether the function needs, or may need, AGPRs.
///
/// The answer is computed once and cached in UsesAGPRs; later calls return the
/// cached value without re-scanning the function.
///
/// \param MF the machine function to inspect.
/// \returns true if \p MF is not an entry function, contains calls, has a
/// virtual register with an AGPR register class, or uses any physical AGPR;
/// false otherwise.
bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
  if (UsesAGPRs)
    return *UsesAGPRs;

  // Non-entry functions and functions containing calls are answered with
  // "true" without looking at the registers at all.
  if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
      MF.getFrameInfo().hasCalls()) {
    UsesAGPRs = true;
    return true;
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // The subtarget and its register info do not change while iterating, so
  // look them up once instead of once per virtual register.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Any virtual register already assigned an AGPR class counts as AGPR usage.
  // Virtual registers without a register class are ignored here.
  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    const Register Reg = Register::index2VirtReg(I);
    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
    if (RC && TRI->isAGPRClass(RC)) {
      UsesAGPRs = true;
      return true;
    }
  }

  // Also look for physical AGPRs that are marked as used.
  for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
    if (MRI.isPhysRegUsed(Reg)) {
      UsesAGPRs = true;
      return true;
    }
  }

  UsesAGPRs = false;
  return false;
}
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// Current recorded maximum possible occupancy.
unsigned Occupancy;

// Cached answer of usesAGPRs(); holds no value until the first query has
// computed it. Declared mutable so the const query can fill it in.
mutable Optional<bool> UsesAGPRs;

MCPhysReg getNextUserSGPR() const;

MCPhysReg getNextSystemSGPR() const;
Expand Down Expand Up @@ -937,6 +939,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
Occupancy = Limit;
limitOccupancy(MF);
}

/// Query whether a function needs or may need AGPRs. The result is cached in
/// UsesAGPRs after the first call.
///
/// \param MF the machine function to inspect.
/// \returns true if a function needs or may need AGPRs. This is always true
/// for non-entry functions and for functions that contain calls.
bool usesAGPRs(const MachineFunction &MF) const;
};

} // end namespace llvm
Expand Down
34 changes: 25 additions & 9 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,18 +510,36 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
// TODO: In an entry function without calls and AGPRs used it is possible
// to use the whole register budget for VGPRs. Even more it shall
// be possible to estimate maximum AGPR/VGPR pressure and split
// register file accordingly.
if (ST.hasGFX90AInsts())
MaxNumVGPRs /= 2;
unsigned MaxNumAGPRs = MaxNumVGPRs;
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

if (ST.hasGFX90AInsts()) {
// In an entry function without calls and AGPRs used it is possible to use
// the whole register budget for VGPRs.

// TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and
// split register file accordingly.
if (MFI->usesAGPRs(MF)) {
MaxNumVGPRs /= 2;
MaxNumAGPRs = MaxNumVGPRs;
} else {
if (MaxNumVGPRs > TotalNumVGPRs) {
MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
MaxNumVGPRs = TotalNumVGPRs;
} else
MaxNumAGPRs = 0;
}
}

for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
}

for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}

Expand All @@ -545,8 +563,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

Register ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
; GFX10CU-WAVE32: NumVgprs: 128
; GFX10CU-WAVE64: NumVgprs: 128
; NOTE(review): the call to the external @foo presumably keeps a real call in
; this kernel, so that it is not treated as a call-free entry function when
; the vector register budget is computed - confirm against the RUN lines.
define amdgpu_kernel void @f512() #512 {
call void @foo()
call void @use256vgprs()
ret void
}
Expand All @@ -563,7 +564,11 @@ attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
; GFX10CU-WAVE32: NumVgprs: 64
; GFX10CU-WAVE64: NumVgprs: 64
; NOTE(review): the call to the external @foo presumably keeps a real call in
; this kernel, so that it is not treated as a call-free entry function when
; the vector register budget is computed - confirm against the RUN lines.
define amdgpu_kernel void @f1024() #1024 {
call void @foo()
call void @use256vgprs()
ret void
}

attributes #1024 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }

declare void @foo()
Loading

0 comments on commit 9bbd96f

Please sign in to comment.