1 //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// \brief Try to reassign registers on GFX10+ to reduce register bank
13 /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14 /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15 /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16 /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
18 /// The shader can read one dword from each of these banks once per cycle.
19 /// If an instruction has to read more register operands from the same bank
20 /// an additional cycle is needed. HW attempts to pre-load registers through
21 /// input operand gathering, but a stall cycle may occur if that fails. For
22 /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23 /// potentially incuring 2 stall cycles.
25 /// The pass tries to reassign registers to reduce bank conflicts.
27 /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28 /// that 4 has to be subtracted from an SGPR bank number to get the real value.
29 /// This also corresponds to bit numbers in bank masks used in the pass.
31 //===----------------------------------------------------------------------===//
34 #include "AMDGPUSubtarget.h"
35 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
36 #include "SIInstrInfo.h"
37 #include "SIMachineFunctionInfo.h"
38 #include "llvm/ADT/SmallSet.h"
39 #include "llvm/ADT/Statistic.h"
40 #include "llvm/CodeGen/LiveInterval.h"
41 #include "llvm/CodeGen/LiveIntervals.h"
42 #include "llvm/CodeGen/LiveRegMatrix.h"
43 #include "llvm/CodeGen/MachineFunctionPass.h"
44 #include "llvm/CodeGen/MachineLoopInfo.h"
45 #include "llvm/CodeGen/VirtRegMap.h"
46 #include "llvm/InitializePasses.h"
47 #include "llvm/Support/MathExtras.h"
51 static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
52 cl::desc("Verify stall cycles in the regbanks reassign pass"),
53 cl::value_desc("0|1|2"),
54 cl::init(0), cl::Hidden);
56 #define DEBUG_TYPE "amdgpu-regbanks-reassign"
58 #define NUM_VGPR_BANKS 4
59 #define NUM_SGPR_BANKS 8
60 #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
61 #define SGPR_BANK_OFFSET NUM_VGPR_BANKS
62 #define VGPR_BANK_MASK 0xf
63 #define SGPR_BANK_MASK 0xff0
64 #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
66 STATISTIC(NumStallsDetected,
67 "Number of operand read stalls detected");
68 STATISTIC(NumStallsRecovered,
69 "Number of operand read stalls recovered");
73 class GCNRegBankReassign : public MachineFunctionPass {
77 OperandMask(unsigned r, unsigned s, unsigned m)
78 : Reg(r), SubReg(s), Mask(m) {}
86 Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
88 : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
90 bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
92 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
93 void dump(const GCNRegBankReassign *P) const {
95 dbgs() << P->printReg(Reg) << " to banks ";
96 dumpFreeBanks(FreeBanks);
97 dbgs() << " weight " << Weight << '\n';
107 class CandidateList : public std::list<Candidate> {
109 // Speedup subsequent sort.
110 void push(const Candidate&& C) {
111 if (C.Weight) push_back(C);
120 GCNRegBankReassign() : MachineFunctionPass(ID) {
121 initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
124 bool runOnMachineFunction(MachineFunction &MF) override;
126 StringRef getPassName() const override { return "GCN RegBank Reassign"; }
128 void getAnalysisUsage(AnalysisUsage &AU) const override {
129 AU.addRequired<MachineLoopInfo>();
130 AU.addRequired<LiveIntervals>();
131 AU.addRequired<VirtRegMap>();
132 AU.addRequired<LiveRegMatrix>();
133 AU.setPreservesAll();
134 MachineFunctionPass::getAnalysisUsage(AU);
138 const GCNSubtarget *ST;
140 const MachineRegisterInfo *MRI;
142 const SIRegisterInfo *TRI;
144 MachineLoopInfo *MLI;
152 unsigned MaxNumVGPRs;
154 unsigned MaxNumSGPRs;
158 SmallVector<OperandMask, 8> OperandMasks;
160 CandidateList Candidates;
162 const MCPhysReg *CSRegs;
164 // Returns bank for a phys reg.
165 unsigned getPhysRegBank(unsigned Reg) const;
167 // Return a bit set for each register bank used. 4 banks for VGPRs and
168 // 8 banks for SGPRs.
169 // Registers already processed and recorded in RegsUsed are excluded.
170 // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
171 unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
173 // Return number of stalls in the instructions.
174 // UsedBanks has bits set for the banks used by all operands.
175 // If Reg and Bank provided substitute the Reg with the Bank.
176 unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
177 unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
179 // Return true if register is regular VGPR or SGPR or their tuples.
180 // Returns false for special registers like m0, vcc etc.
181 bool isReassignable(unsigned Reg) const;
183 // Check if registers' defs are old and may be pre-loaded.
184 // Returns 0 if both registers are old enough, 1 or 2 if one or both
185 // registers will not likely be pre-loaded.
186 unsigned getOperandGatherWeight(const MachineInstr& MI,
189 unsigned StallCycles) const;
192 // Find all bank bits in UsedBanks where Mask can be relocated to.
193 unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
195 // Find all bank bits in UsedBanks where Mask can be relocated to.
196 // Bank is relative to the register and not its subregister component.
197 // Returns 0 is a register is not reassignable.
198 unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
199 unsigned UsedBanks) const;
201 // Add cadidate instruction to the work list.
202 void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
203 unsigned StallCycles);
205 // Collect cadidate instructions across function. Returns a number stall
206 // cycles detected. Only counts stalls if Collect is false.
207 unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
209 // Remove all candidates that read specified register.
210 void removeCandidates(unsigned Reg);
212 // Compute stalls within the uses of SrcReg replaced by a register from
213 // Bank. If Bank is -1 does not perform substitution. If Collect is set
214 // candidates are collected and added to work list.
215 unsigned computeStallCycles(unsigned SrcReg,
216 unsigned Reg = AMDGPU::NoRegister,
217 int Bank = -1, bool Collect = false);
219 // Search for a register in Bank unused within LI.
220 // Returns phys reg or NoRegister.
221 unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
223 // Try to reassign candidate. Returns number or stall cycles saved.
224 unsigned tryReassign(Candidate &C);
226 bool verifyCycles(MachineFunction &MF,
227 unsigned OriginalCycles, unsigned CyclesSaved);
230 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
232 Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
233 return Printable([Reg, SubReg, this](raw_ostream &OS) {
234 if (Register::isPhysicalRegister(Reg)) {
235 OS << llvm::printReg(Reg, TRI);
238 if (!VRM->isAssignedReg(Reg))
239 OS << "<unassigned> " << llvm::printReg(Reg, TRI);
241 OS << llvm::printReg(Reg, TRI) << '('
242 << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
244 OS << ':' << TRI->getSubRegIndexName(SubReg);
248 static Printable printBank(unsigned Bank) {
249 return Printable([Bank](raw_ostream &OS) {
250 OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
254 static void dumpFreeBanks(unsigned FreeBanks) {
255 for (unsigned L = 0; L < NUM_BANKS; ++L)
256 if (FreeBanks & (1 << L))
257 dbgs() << printBank(L) << ' ';
262 } // End anonymous namespace.
264 INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
266 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
267 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
268 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
269 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
270 INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
274 char GCNRegBankReassign::ID = 0;
276 char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
278 unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
279 assert(Register::isPhysicalRegister(Reg));
281 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
282 unsigned Size = TRI->getRegSizeInBits(*RC);
284 Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
286 if (TRI->hasVGPRs(RC)) {
287 Reg -= AMDGPU::VGPR0;
288 return Reg % NUM_VGPR_BANKS;
291 Reg = TRI->getEncodingValue(Reg) / 2;
292 return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
295 unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
297 if (Register::isVirtualRegister(Reg)) {
298 if (!VRM->isAssignedReg(Reg))
301 Reg = VRM->getPhys(Reg);
305 Reg = TRI->getSubReg(Reg, SubReg);
308 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
309 unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
311 Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
313 if (TRI->hasVGPRs(RC)) {
314 // VGPRs have 4 banks assigned in a round-robin fashion.
315 Reg -= AMDGPU::VGPR0;
316 unsigned Mask = (1 << Size) - 1;
318 // Bitmask lacks an extract method
319 for (unsigned I = 0; I < Size; ++I)
320 if (RegsUsed.test(Reg + I))
322 RegsUsed.set(Reg, Reg + Size);
324 Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
325 return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
328 // SGPRs have 8 banks holding 2 consequitive registers each.
329 Reg = TRI->getEncodingValue(Reg) / 2;
330 unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
331 if (Reg + StartBit >= RegsUsed.size())
336 unsigned Mask = (1 << Size) - 1;
338 for (unsigned I = 0; I < Size; ++I)
339 if (RegsUsed.test(StartBit + Reg + I))
341 RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
343 Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
344 : unsigned(Bank - SGPR_BANK_OFFSET);
345 Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
346 // Reserve 4 bank ids for VGPRs.
347 return Mask << SGPR_BANK_OFFSET;
350 unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
354 unsigned StallCycles = 0;
357 if (MI.isDebugValue())
361 OperandMasks.clear();
362 for (const auto& Op : MI.explicit_uses()) {
363 // Undef can be assigned to any register, so two vregs can be assigned
364 // the same phys reg within the same instruction.
365 if (!Op.isReg() || Op.isUndef())
368 Register R = Op.getReg();
369 if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
372 unsigned ShiftedBank = Bank;
374 if (Bank != -1 && R == Reg && Op.getSubReg()) {
375 unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
376 if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
377 // If a register spans all banks we cannot shift it to avoid conflict.
378 if (countPopulation(LM) >= NUM_VGPR_BANKS)
380 ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
381 } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
382 // If a register spans all banks we cannot shift it to avoid conflict.
383 if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
385 ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
386 (countTrailingZeros(LM) >> 1)) %
391 unsigned Mask = getRegBankMask(R, Op.getSubReg(),
392 (Reg == R) ? ShiftedBank : -1);
393 StallCycles += countPopulation(UsedBanks & Mask);
395 OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
401 unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
404 unsigned StallCycles) const
407 MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
408 MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
409 for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
410 if (MI.isDebugInstr())
413 if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
415 if (Def->modifiesRegister(Reg1, TRI))
417 if (Def->modifiesRegister(Reg2, TRI))
420 return countPopulation(Defs);
423 bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
424 if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
427 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
429 Register PhysReg = VRM->getPhys(Reg);
431 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
434 for (auto U : MRI->use_nodbg_operands(Reg)) {
437 const MachineInstr *UseInst = U.getParent();
438 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
442 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
443 if (TRI->hasVGPRs(RC))
446 unsigned Size = TRI->getRegSizeInBits(*RC);
448 PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
450 return AMDGPU::SGPR_32RegClass.contains(PhysReg);
453 unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
454 unsigned UsedBanks) const {
455 unsigned Size = countPopulation(Mask);
456 unsigned FreeBanks = 0;
457 unsigned Bank = findFirstSet(Mask);
461 // Find free VGPR banks
462 if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
463 for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
466 unsigned NewMask = ((1 << Size) - 1) << I;
467 NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
468 if (!(UsedBanks & NewMask))
474 // Find free SGPR banks
475 // SGPR tuples must be aligned, so step is size in banks it
477 Bank -= SGPR_BANK_OFFSET;
478 for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
481 unsigned NewMask = ((1 << Size) - 1) << I;
482 NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
483 if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
484 FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
490 unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
493 unsigned UsedBanks) const {
494 if (!isReassignable(Reg))
497 unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
499 unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
500 if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
501 unsigned Shift = countTrailingZeros(LM);
502 if (Shift >= NUM_VGPR_BANKS)
504 unsigned VB = FreeBanks & VGPR_BANK_MASK;
505 FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
507 } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
508 unsigned Shift = countTrailingZeros(LM) >> 1;
509 if (Shift >= NUM_SGPR_BANKS)
511 unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
512 FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
513 SGPR_BANK_SHIFTED_MASK;
514 FreeBanks <<= SGPR_BANK_OFFSET;
517 LLVM_DEBUG(if (FreeBanks) {
518 dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
519 << " to banks: "; dumpFreeBanks(FreeBanks);
525 void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
527 unsigned StallCycles) {
528 LLVM_DEBUG(MI.dump());
533 LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
535 for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
536 for (unsigned J = I + 1; J != E; ++J) {
537 if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
540 unsigned Reg1 = OperandMasks[I].Reg;
541 unsigned Reg2 = OperandMasks[J].Reg;
542 unsigned SubReg1 = OperandMasks[I].SubReg;
543 unsigned SubReg2 = OperandMasks[J].SubReg;
544 unsigned Mask1 = OperandMasks[I].Mask;
545 unsigned Mask2 = OperandMasks[J].Mask;
546 unsigned Size1 = countPopulation(Mask1);
547 unsigned Size2 = countPopulation(Mask2);
549 LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
550 " and " << printReg(Reg2, SubReg2) << '\n');
552 unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
553 Weight += MLI->getLoopDepth(MI.getParent()) * 10;
555 LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
557 unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
558 unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
560 Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
561 + ((Size2 > Size1) ? 1 : 0)));
563 Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
564 + ((Size1 > Size2) ? 1 : 0)));
569 unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
570 unsigned Reg, int Bank,
572 unsigned TotalStallCycles = 0;
573 unsigned UsedBanks = 0;
574 SmallSet<const MachineInstr *, 16> Visited;
576 for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
579 if (!Visited.insert(&MI).second)
581 unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
582 TotalStallCycles += StallCycles;
584 collectCandidates(MI, UsedBanks, StallCycles);
587 return TotalStallCycles;
590 unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
591 unsigned Bank) const {
592 const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
593 unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
595 unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
598 for (unsigned Reg : RC->getRegisters()) {
599 // Check occupancy limit.
600 if (TRI->isSubRegisterEq(Reg, MaxReg))
603 if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
606 for (unsigned I = 0; CSRegs[I]; ++I)
607 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
608 !LRM->isPhysRegUsed(CSRegs[I]))
609 return AMDGPU::NoRegister;
611 LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
613 if (!LRM->checkInterference(LI, Reg))
617 return AMDGPU::NoRegister;
620 unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
621 if (!LIS->hasInterval(C.Reg))
624 LiveInterval &LI = LIS->getInterval(C.Reg);
625 LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
628 // For each candidate bank walk all instructions in the range of live
629 // interval and check if replacing the register with one belonging to
630 // the candidate bank reduces conflicts.
632 unsigned OrigStalls = computeStallCycles(C.Reg);
633 LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
638 BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
639 bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
643 SmallVector<BankStall, 8> BankStalls;
645 for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
646 if (C.FreeBanks & (1 << Bank)) {
647 LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
648 unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
649 if (Stalls < OrigStalls) {
650 LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
652 BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
656 std::sort(BankStalls.begin(), BankStalls.end());
658 Register OrigReg = VRM->getPhys(C.Reg);
660 while (!BankStalls.empty()) {
661 BankStall BS = BankStalls.pop_back_val();
662 unsigned Reg = scavengeReg(LI, BS.Bank);
663 if (Reg == AMDGPU::NoRegister) {
664 LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
668 LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
669 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
670 << " in bank " << printBank(BS.Bank) << '\n');
672 LRM->assign(LI, Reg);
674 LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
676 return OrigStalls - BS.Stalls;
678 LRM->assign(LI, OrigReg);
683 unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
685 unsigned TotalStallCycles = 0;
687 for (MachineBasicBlock &MBB : MF) {
689 LLVM_DEBUG(if (Collect) {
690 if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
691 else dbgs() << MBB.getName(); dbgs() << ":\n";
694 for (MachineInstr &MI : MBB.instrs()) {
696 continue; // we analyze the instructions inside the bundle individually
698 unsigned UsedBanks = 0;
699 unsigned StallCycles = analyzeInst(MI, UsedBanks);
702 collectCandidates(MI, UsedBanks, StallCycles);
704 TotalStallCycles += StallCycles;
707 LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
710 return TotalStallCycles;
713 void GCNRegBankReassign::removeCandidates(unsigned Reg) {
714 Candidates.remove_if([Reg, this](const Candidate& C) {
715 return C.MI->readsRegister(Reg, TRI);
719 bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
720 unsigned OriginalCycles,
721 unsigned CyclesSaved) {
722 unsigned StallCycles = collectCandidates(MF, false);
723 LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
724 << " stall cycles left\n");
725 return StallCycles + CyclesSaved == OriginalCycles;
728 bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
729 ST = &MF.getSubtarget<GCNSubtarget>();
730 if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
733 MRI = &MF.getRegInfo();
734 TRI = ST->getRegisterInfo();
735 MLI = &getAnalysis<MachineLoopInfo>();
736 VRM = &getAnalysis<VirtRegMap>();
737 LRM = &getAnalysis<LiveRegMatrix>();
738 LIS = &getAnalysis<LiveIntervals>();
740 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
741 unsigned Occupancy = MFI->getOccupancy();
742 MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
743 MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
744 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
745 MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
747 CSRegs = MRI->getCalleeSavedRegs();
749 RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
750 TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
752 LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
755 unsigned StallCycles = collectCandidates(MF);
756 NumStallsDetected += StallCycles;
758 LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
759 "function " << MF.getName() << '\n');
763 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
764 for (auto C : Candidates) C.dump(this);
767 unsigned CyclesSaved = 0;
768 while (!Candidates.empty()) {
769 Candidate C = Candidates.back();
770 unsigned LocalCyclesSaved = tryReassign(C);
771 CyclesSaved += LocalCyclesSaved;
773 if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
774 report_fatal_error("RegBank reassign stall cycles verification failed.");
776 Candidates.pop_back();
777 if (LocalCyclesSaved) {
778 removeCandidates(C.Reg);
779 computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
782 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
783 for (auto C : Candidates)
788 NumStallsRecovered += CyclesSaved;
790 LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
791 << " cycles saved in function " << MF.getName() << '\n');
795 if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
796 report_fatal_error("RegBank reassign stall cycles verification failed.");
800 return CyclesSaved > 0;