1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11 /// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12 /// with sequential versions where possible.
14 //===----------------------------------------------------------------------===//
17 #include "GCNSubtarget.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/LiveRegMatrix.h"
23 #include "llvm/CodeGen/MachineFunctionPass.h"
24 #include "llvm/CodeGen/VirtRegMap.h"
25 #include "llvm/InitializePasses.h"
29 #define DEBUG_TYPE "amdgpu-nsa-reassign"
31 STATISTIC(NumNSAInstructions,
32 "Number of NSA instructions with non-sequential address found");
33 STATISTIC(NumNSAConverted,
34 "Number of NSA instructions changed to sequential");
// Machine function pass that tries to give the vaddr operands of NSA image
// instructions consecutive physical registers (see the file header).
// NOTE(review): several short lines of this class (access specifiers, the
// enum header, some member declarations, closing braces) are not visible in
// this listing.
38 class GCNNSAReassign : public MachineFunctionPass {
// Registers the pass with the global PassRegistry on construction.
42 GCNNSAReassign() : MachineFunctionPass(ID) {
43 initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
// Pass entry point; defined out of line.
46 bool runOnMachineFunction(MachineFunction &MF) override;
48 StringRef getPassName() const override { return "GCN NSA Reassign"; }
// Requests LiveIntervals, VirtRegMap and LiveRegMatrix, the three analyses
// that runOnMachineFunction fetches with getAnalysis<>.
50 void getAnalysisUsage(AnalysisUsage &AU) const override {
51 AU.addRequired<LiveIntervals>();
52 AU.addRequired<VirtRegMap>();
53 AU.addRequired<LiveRegMatrix>();
55 MachineFunctionPass::getAnalysisUsage(AU);
// Classification produced by CheckNSA. Enumerator order is significant:
// runOnMachineFunction tests `CheckNSA(...) < NSA_Status::CONTIGUOUS`, so
// CONTIGUOUS has to stay the greatest value.
60 NOT_NSA, // Not an NSA instruction
61 FIXED, // NSA which we cannot modify
62 NON_CONTIGUOUS, // NSA with non-sequential address which we can try
64 CONTIGUOUS // NSA with all sequential address registers
// Per-function state; each pointer below is (re)initialised at the start of
// runOnMachineFunction.
67 const GCNSubtarget *ST;
69 const MachineRegisterInfo *MRI;
71 const SIRegisterInfo *TRI;
// Result of MRI->getCalleeSavedRegs(); canAssign walks it until it reaches a
// zero entry.
81 const MCPhysReg *CSRegs;
// Classifies MI as one of the NSA_Status values. Fast defaults to false; the
// second call in runOnMachineFunction passes true.
// NOTE(review): how Fast changes the checks is not visible in this listing.
83 NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
// Attempts to place Intervals[N] into physical register StartReg + N.
85 bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
86 unsigned StartReg) const;
// Checks whether the NumRegs registers starting at StartReg may be used
// (allocatable, and subject to the callee-saved register test).
88 bool canAssign(unsigned StartReg, unsigned NumRegs) const;
// Scans VGPR ranges, starting at VGPR0, for one that Intervals can be moved
// to.
90 bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
93 } // End anonymous namespace.
// Pass registration. The dependencies listed here are the same three analyses
// that getAnalysisUsage requests. The trailing macro arguments are on lines
// that are not visible in this listing.
95 INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
97 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
98 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
99 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
100 INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
// Storage for the pass ID; it is passed to the MachineFunctionPass base in the
// constructor.
104 char GCNNSAReassign::ID = 0;
// Reference to the pass ID exposed in the llvm namespace.
106 char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
// Tries to move Intervals[N] to physical register StartReg + N for every N.
// Visible steps:
//   1. drop the current LiveRegMatrix assignment of every interval whose
//      virtual register has a physical register in the VirtRegMap;
//   2. query interference of each interval against its target register;
//   3. assign every interval to its target register.
// NOTE(review): the return-type line and both return statements are elided
// from this listing. Presumably the function returns false as soon as step 2
// reports interference (leaving the intervals unassigned) and true after
// step 3 — confirm against the full file.
109 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
110 unsigned StartReg) const {
111 unsigned NumRegs = Intervals.size();
// Step 1: release current assignments so the intervals do not interfere with
// each other's target registers.
113 for (unsigned N = 0; N < NumRegs; ++N)
114 if (VRM->hasPhys(Intervals[N]->reg()))
115 LRM->unassign(*Intervals[N]);
// Step 2: interference query for every target register; the statement
// controlled by this `if` is not visible here.
117 for (unsigned N = 0; N < NumRegs; ++N)
118 if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
// Step 3: commit the new consecutive assignment.
121 for (unsigned N = 0; N < NumRegs; ++N)
122 LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
// Checks each of the NumRegs registers StartReg, StartReg + 1, ... in turn.
// A register is tested for MRI->isAllocatable() and then compared against
// every entry of the zero-terminated CSRegs list.
// NOTE(review): the statements controlled by the two `if`s and the final
// return are elided from this listing. Presumably a non-allocatable register,
// or one related to a callee-saved register that LiveRegMatrix reports as not
// yet used, makes the function return false, and it returns true otherwise —
// confirm against the full file.
127 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
128 for (unsigned N = 0; N < NumRegs; ++N) {
129 unsigned Reg = StartReg + N;
130 if (!MRI->isAllocatable(Reg))
// Walk the callee-saved list; the loop stops at the first zero entry.
133 for (unsigned I = 0; CSRegs[I]; ++I)
// Matches when TRI->isSubRegisterEq(Reg, CSRegs[I]) holds and that
// callee-saved register is not marked used in the LiveRegMatrix.
134 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
135 !LRM->isPhysRegUsed(CSRegs[I]))
// Searches for a run of Intervals.size() consecutive VGPRs that the intervals
// can be moved to. Candidate start registers run from VGPR0 up to and
// including MaxReg, so that the whole run stays within MaxNumVGPRs registers.
// Each start register is filtered through canAssign before tryAssignRegisters
// is attempted.
// NOTE(review): the return-type line, the statements controlled by the `if`s
// and the final return are elided from this listing. Presumably: return false
// when more registers are needed than MaxNumVGPRs, skip a start register that
// canAssign rejects, return true on the first successful tryAssignRegisters,
// and false when the scan is exhausted — confirm against the full file.
143 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
144 unsigned NumRegs = Intervals.size();
// Guard placed before the subtraction below, which would otherwise wrap
// around in unsigned arithmetic.
146 if (NumRegs > MaxNumVGPRs)
// Last start register for which Reg + NumRegs - 1 is still below
// VGPR0 + MaxNumVGPRs.
148 unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
150 for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
151 if (!canAssign(Reg, NumRegs))
154 if (tryAssignRegisters(Intervals, Reg))
// Classifies MI for this pass. Visible outcomes:
//   NOT_NSA        - early exits after the MIMG info lookup and after the
//                    encoding switch;
//   FIXED          - some vaddr operand fails one of the per-operand checks;
//   NON_CONTIGUOUS / CONTIGUOUS - selected from the NSA flag after all
//                    Info->VAddrOperands operands have been inspected.
// NOTE(review): this listing omits several short lines of the function (the
// guards in front of some returns, the declaration of NSA, and every use of
// the Fast parameter), so the comments on those spots are hedged.
161 GCNNSAReassign::NSA_Status
162 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
163 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
// The guard for this return is elided; presumably it fires when no MIMG info
// exists for the opcode — confirm.
165 return NSA_Status::NOT_NSA;
// Only the GFX10 and GFX11 NSA encodings are listed as cases. The NOT_NSA
// return below presumably belongs to an elided default label — confirm.
167 switch (Info->MIMGEncoding) {
168 case AMDGPU::MIMGEncGfx10NSA:
169 case AMDGPU::MIMGEncGfx11NSA:
172 return NSA_Status::NOT_NSA;
// Operand index of vaddr0 (the left-hand side of this statement is elided);
// the loop below reads operands VAddr0Idx + I.
176 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
178 unsigned VgprBase = 0;
180 for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
181 const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
182 Register Reg = Op.getReg();
// An operand that is already a physical register, or a virtual register the
// VirtRegMap does not report as assigned, cannot be reassigned.
183 if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
184 return NSA_Status::FIXED;
186 Register PhysReg = VRM->getPhys(Reg);
// The guard for this return is elided from the listing; presumably it
// rejects an operand without a physical register — confirm.
190 return NSA_Status::FIXED;
192 // TODO: address the below limitation to handle GFX11 BVH instructions
193 // Bail if address is not a VGPR32. That should be possible to extend the
194 // optimization to work with subregs of a wider register tuples, but the
195 // logic to find free registers will be much more complicated with much
196 // less chances for success. That seems reasonable to assume that in most
197 // cases a tuple is used because a vector variable contains different
198 // parts of an address and it is either already consecutive or cannot
199 // be reassigned if not. If needed it is better to rely on register
200 // coalescer to process such address tuples.
201 if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
202 return NSA_Status::FIXED;
204 // InlineSpiller does not call LRM::assign() after an LI split leaving
205 // it in an inconsistent state, so we cannot call LRM::unassign().
206 // See llvm bug #48911.
207 // Skip reassign if a register has originated from such split.
208 // FIXME: Remove the workaround when bug #48911 is fixed.
209 if (VRM->getPreSplitReg(Reg))
210 return NSA_Status::FIXED;
212 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
// Leave the register alone when its unique definition is a COPY whose source
// is the very physical register it is currently assigned to.
214 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
215 return NSA_Status::FIXED;
// Inspect every non-debug use of the register. The condition guarding the
// first FIXED return inside the loop is elided from this listing.
217 for (auto U : MRI->use_nodbg_operands(Reg)) {
219 return NSA_Status::FIXED;
220 const MachineInstr *UseInst = U.getParent();
// Likewise bail out when a use is a COPY into the currently assigned physical
// register.
221 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
222 return NSA_Status::FIXED;
// runOnMachineFunction later calls LIS->getInterval(Reg) for each operand, so
// a live interval must exist.
225 if (!LIS->hasInterval(Reg))
226 return NSA_Status::FIXED;
// Contiguity test: operand I is expected in register VgprBase + I. The leading
// `if` of this else-if and the statement the else-if controls are elided;
// presumably the first operand's register is recorded as VgprBase and a
// mismatch sets the NSA flag — confirm.
231 else if (VgprBase + I != PhysReg)
235 return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
// Pass entry point. Collects the NSA instructions of MF that CheckNSA
// classifies as CONTIGUOUS or NON_CONTIGUOUS, then tries to move the address
// registers of each candidate to a consecutive VGPR range with scavengeRegs,
// restoring the original assignment when the attempt is rejected.
// NOTE(review): many short lines (returns, `continue`s, guards, closing
// braces) are elided from this listing and the end of the function is not
// visible, so the comments below describe only the visible statements and
// hedge the rest.
238 bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
239 ST = &MF.getSubtarget<GCNSubtarget>();
// Subtargets older than GFX10 are filtered out here; the controlled statement
// is elided (presumably an early return — confirm).
240 if (ST->getGeneration() < GCNSubtarget::GFX10)
243 MRI = &MF.getRegInfo();
244 TRI = ST->getRegisterInfo();
245 VRM = &getAnalysis<VirtRegMap>();
246 LRM = &getAnalysis<LiveRegMatrix>();
247 LIS = &getAnalysis<LiveIntervals>();
249 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// VGPR budget: the smaller of the limit for this function and the limit for
// the function's recorded occupancy.
250 MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
251 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
252 CSRegs = MRI->getCalleeSavedRegs();
// Candidate.first is the instruction. Candidate.second is true when it was
// CONTIGUOUS at collection time and false when it was NON_CONTIGUOUS.
254 using Candidate = std::pair<const MachineInstr*, bool>;
255 SmallVector<Candidate, 32> Candidates;
// Collection walk over all instructions, in block order.
256 for (const MachineBasicBlock &MBB : MF) {
257 for (const MachineInstr &MI : MBB) {
258 switch (CheckNSA(MI)) {
261 case NSA_Status::CONTIGUOUS:
262 Candidates.push_back(std::pair(&MI, true));
264 case NSA_Status::NON_CONTIGUOUS:
265 Candidates.push_back(std::pair(&MI, false));
266 ++NumNSAInstructions;
272 bool Changed = false;
// Reassignment loop over the collected candidates.
273 for (auto &C : Candidates) {
277 const MachineInstr *MI = C.first;
// Re-check with Fast = true: an earlier reassignment may already have made
// this instruction contiguous.
278 if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
279 // Already happen to be fixed.
285 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
287 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
// Intervals[I] and OrigRegs[I] hold the live interval and the current
// physical register of each address operand pushed by the loop below.
// MinInd/MaxInd accumulate the slot-index span covered by those intervals.
289 SmallVector<LiveInterval *, 16> Intervals;
290 SmallVector<MCRegister, 16> OrigRegs;
291 SlotIndex MinInd, MaxInd;
292 for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
293 const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
294 Register Reg = Op.getReg();
295 LiveInterval *LI = &LIS->getInterval(Reg);
296 if (llvm::is_contained(Intervals, LI)) {
297 // Same register used, unable to make sequential
301 Intervals.push_back(LI);
302 OrigRegs.push_back(VRM->getPhys(Reg));
304 // The address input is undef, so it doesn't contribute to the relevant
305 // range. Seed a reasonable index range if required.
307 MinInd = MaxInd = LIS->getInstructionIndex(*MI);
// Widen the span; the first operand (I == 0) initialises it.
310 MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
311 MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
// The statement controlled by this check is elided; presumably the candidate
// is skipped when no intervals were gathered — confirm.
314 if (Intervals.empty())
317 LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
318 << "\tOriginal allocation:\t";
321 << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
// Try to find a consecutive VGPR range. The lines that follow handle failure:
// a debug message, then a test of whether the last interval still has a
// physical register, which the original comment reads as "allocation
// unchanged". The guards around them are elided from this listing.
324 bool Success = scavengeRegs(Intervals);
326 LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
327 if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
// Binary search, among the candidates preceding C, for the first one whose
// slot index is not below MinInd. This relies on Candidates being ordered by
// slot index — TODO confirm (they are collected by walking MF in block
// order).
330 // Check we did not make it worse for other instructions.
331 auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
332 [this](const Candidate &C, SlotIndex I) {
333 return LIS->getInstructionIndex(*C.first) < I;
// Scan forward from that candidate while instructions lie before MaxInd. A
// candidate recorded as contiguous that now classifies below CONTIGUOUS is a
// conflict; the statement recording it is elided (presumably it clears
// Success — confirm).
335 for (auto E = Candidates.end(); Success && I != E &&
336 LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
337 if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
339 LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
// Rollback: release whatever the intervals hold now and put each one back on
// its original physical register. The guard selecting this path is elided
// (presumably it runs only when Success is false — confirm).
345 for (unsigned I = 0; I < Info->VAddrOperands; ++I)
346 if (VRM->hasPhys(Intervals[I]->reg()))
347 LRM->unassign(*Intervals[I]);
349 for (unsigned I = 0; I < Info->VAddrOperands; ++I)
350 LRM->assign(*Intervals[I], OrigRegs[I]);
358 dbgs() << "\tNew allocation:\t\t ["
359 << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
361 << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)