//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file /// \brief Try to reassign registers on GFX10+ to reduce register bank /// conflicts. /// /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. /// /// The shader can read one dword from each of these banks once per cycle. /// If an instruction has to read more register operands from the same bank /// an additional cycle is needed. HW attempts to pre-load registers through /// input operand gathering, but a stall cycle may occur if that fails. For /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, /// potentially incuring 2 stall cycles. /// /// The pass tries to reassign registers to reduce bank conflicts. /// /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so /// that 4 has to be subtracted from an SGPR bank number to get the real value. /// This also corresponds to bit numbers in bank masks used in the pass. /// //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/MathExtras.h" using namespace llvm; static cl::opt VerifyStallCycles("amdgpu-verify-regbanks-reassign", cl::desc("Verify stall cycles in the regbanks reassign pass"), cl::value_desc("0|1|2"), cl::init(0), cl::Hidden); #define DEBUG_TYPE "amdgpu-regbanks-reassign" #define NUM_VGPR_BANKS 4 #define NUM_SGPR_BANKS 8 #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) #define SGPR_BANK_OFFSET NUM_VGPR_BANKS #define VGPR_BANK_MASK 0xf #define SGPR_BANK_MASK 0xff0 #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) STATISTIC(NumStallsDetected, "Number of operand read stalls detected"); STATISTIC(NumStallsRecovered, "Number of operand read stalls recovered"); namespace { class GCNRegBankReassign : public MachineFunctionPass { class OperandMask { public: OperandMask(unsigned r, unsigned s, unsigned m) : Reg(r), SubReg(s), Mask(m) {} unsigned Reg; unsigned SubReg; unsigned Mask; }; class Candidate { public: Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, unsigned weight) : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(const GCNRegBankReassign *P) const { MI->dump(); dbgs() << P->printReg(Reg) << " to banks "; dumpFreeBanks(FreeBanks); dbgs() << " weight " << Weight << '\n'; } #endif MachineInstr *MI; unsigned Reg; unsigned FreeBanks; unsigned Weight; }; class CandidateList : public std::list { public: // Speedup subsequent sort. void push(const Candidate&& C) { if (C.Weight) push_back(C); else push_front(C); } }; public: static char ID; public: GCNRegBankReassign() : MachineFunctionPass(ID) { initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "GCN RegBank Reassign"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } private: const GCNSubtarget *ST; const MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; MachineLoopInfo *MLI; VirtRegMap *VRM; LiveRegMatrix *LRM; LiveIntervals *LIS; unsigned MaxNumVGPRs; unsigned MaxNumSGPRs; BitVector RegsUsed; SmallVector OperandMasks; CandidateList Candidates; const MCPhysReg *CSRegs; // Returns bank for a phys reg. unsigned getPhysRegBank(unsigned Reg) const; // Return a bit set for each register bank used. 4 banks for VGPRs and // 8 banks for SGPRs. // Registers already processed and recorded in RegsUsed are excluded. // If Bank is not -1 assume Reg:SubReg to belong to that Bank. unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); // Return number of stalls in the instructions. // UsedBanks has bits set for the banks used by all operands. // If Reg and Bank provided substitute the Reg with the Bank. unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, unsigned Reg = AMDGPU::NoRegister, int Bank = -1); // Return true if register is regular VGPR or SGPR or their tuples. // Returns false for special registers like m0, vcc etc. bool isReassignable(unsigned Reg) const; // Check if registers' defs are old and may be pre-loaded. // Returns 0 if both registers are old enough, 1 or 2 if one or both // registers will not likely be pre-loaded. unsigned getOperandGatherWeight(const MachineInstr& MI, unsigned Reg1, unsigned Reg2, unsigned StallCycles) const; // Find all bank bits in UsedBanks where Mask can be relocated to. unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; // Find all bank bits in UsedBanks where Mask can be relocated to. // Bank is relative to the register and not its subregister component. // Returns 0 is a register is not reassignable. unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, unsigned UsedBanks) const; // Add cadidate instruction to the work list. void collectCandidates(MachineInstr& MI, unsigned UsedBanks, unsigned StallCycles); // Collect cadidate instructions across function. Returns a number stall // cycles detected. Only counts stalls if Collect is false. unsigned collectCandidates(MachineFunction &MF, bool Collect = true); // Remove all candidates that read specified register. void removeCandidates(unsigned Reg); // Compute stalls within the uses of SrcReg replaced by a register from // Bank. If Bank is -1 does not perform substitution. If Collect is set // candidates are collected and added to work list. unsigned computeStallCycles(unsigned SrcReg, unsigned Reg = AMDGPU::NoRegister, int Bank = -1, bool Collect = false); // Search for a register in Bank unused within LI. // Returns phys reg or NoRegister. unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; // Try to reassign candidate. Returns number or stall cycles saved. unsigned tryReassign(Candidate &C); bool verifyCycles(MachineFunction &MF, unsigned OriginalCycles, unsigned CyclesSaved); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) public: Printable printReg(unsigned Reg, unsigned SubReg = 0) const { return Printable([Reg, SubReg, this](raw_ostream &OS) { if (TargetRegisterInfo::isPhysicalRegister(Reg)) { OS << llvm::printReg(Reg, TRI); return; } if (!VRM->isAssignedReg(Reg)) OS << " " << llvm::printReg(Reg, TRI); else OS << llvm::printReg(Reg, TRI) << '(' << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; if (SubReg) OS << ':' << TRI->getSubRegIndexName(SubReg); }); } static Printable printBank(unsigned Bank) { return Printable([Bank](raw_ostream &OS) { OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); }); } static void dumpFreeBanks(unsigned FreeBanks) { for (unsigned L = 0; L < NUM_BANKS; ++L) if (FreeBanks & (1 << L)) dbgs() << printBank(L) << ' '; } #endif }; } // End anonymous namespace. INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", false, false) char GCNRegBankReassign::ID = 0; char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { assert (TargetRegisterInfo::isPhysicalRegister(Reg)); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getRegSizeInBits(*RC); if (Size > 32) Reg = TRI->getSubReg(Reg, AMDGPU::sub0); if (TRI->hasVGPRs(RC)) { Reg -= AMDGPU::VGPR0; return Reg % NUM_VGPR_BANKS; } Reg = TRI->getEncodingValue(Reg) / 2; return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; } unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, int Bank) { if (TargetRegisterInfo::isVirtualRegister(Reg)) { if (!VRM->isAssignedReg(Reg)) return 0; Reg = VRM->getPhys(Reg); if (!Reg) return 0; if (SubReg) Reg = TRI->getSubReg(Reg, SubReg); } const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getRegSizeInBits(*RC) / 32; if (Size > 1) Reg = TRI->getSubReg(Reg, AMDGPU::sub0); if (TRI->hasVGPRs(RC)) { // VGPRs have 4 banks assigned in a round-robin fashion. Reg -= AMDGPU::VGPR0; unsigned Mask = (1 << Size) - 1; unsigned Used = 0; // Bitmask lacks an extract method for (unsigned I = 0; I < Size; ++I) if (RegsUsed.test(Reg + I)) Used |= 1 << I; RegsUsed.set(Reg, Reg + Size); Mask &= ~Used; Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank); return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; } // SGPRs have 8 banks holding 2 consequitive registers each. Reg = TRI->getEncodingValue(Reg) / 2; unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); if (Reg + StartBit >= RegsUsed.size()) return 0; if (Size > 1) Size /= 2; unsigned Mask = (1 << Size) - 1; unsigned Used = 0; for (unsigned I = 0; I < Size; ++I) if (RegsUsed.test(StartBit + Reg + I)) Used |= 1 << I; RegsUsed.set(StartBit + Reg, StartBit + Reg + Size); Mask &= ~Used; Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS : unsigned(Bank - SGPR_BANK_OFFSET); Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; // Reserve 4 bank ids for VGPRs. return Mask << SGPR_BANK_OFFSET; } unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, unsigned Reg, int Bank) { unsigned StallCycles = 0; UsedBanks = 0; if (MI.isDebugValue()) return 0; RegsUsed.reset(); OperandMasks.clear(); for (const auto& Op : MI.explicit_uses()) { // Undef can be assigned to any register, so two vregs can be assigned // the same phys reg within the same instruction. if (!Op.isReg() || Op.isUndef()) continue; unsigned R = Op.getReg(); if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) continue; unsigned ShiftedBank = Bank; if (Bank != -1 && R == Reg && Op.getSubReg()) { unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger(); if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) { // If a register spans all banks we cannot shift it to avoid conflict. if (countPopulation(LM) >= NUM_VGPR_BANKS) continue; ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) { // If a register spans all banks we cannot shift it to avoid conflict. if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) continue; ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + (countTrailingZeros(LM) >> 1)) % NUM_SGPR_BANKS; } } unsigned Mask = getRegBankMask(R, Op.getSubReg(), (Reg == R) ? ShiftedBank : -1); StallCycles += countPopulation(UsedBanks & Mask); UsedBanks |= Mask; OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); } return StallCycles; } unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, unsigned Reg1, unsigned Reg2, unsigned StallCycles) const { unsigned Defs = 0; MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { if (MI.isDebugInstr()) continue; --Def; if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) continue; if (Def->modifiesRegister(Reg1, TRI)) Defs |= 1; if (Def->modifiesRegister(Reg2, TRI)) Defs |= 2; } return countPopulation(Defs); } bool GCNRegBankReassign::isReassignable(unsigned Reg) const { if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) return false; const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); unsigned PhysReg = VRM->getPhys(Reg); if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) return false; for (auto U : MRI->use_nodbg_operands(Reg)) { if (U.isImplicit()) return false; const MachineInstr *UseInst = U.getParent(); if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) return false; } const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); if (TRI->hasVGPRs(RC)) return true; unsigned Size = TRI->getRegSizeInBits(*RC); if (Size > 32) PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); return AMDGPU::SGPR_32RegClass.contains(PhysReg); } unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, unsigned UsedBanks) const { unsigned Size = countPopulation(Mask); unsigned FreeBanks = 0; unsigned Bank = findFirstSet(Mask); UsedBanks &= ~Mask; // Find free VGPR banks if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { if (Bank == I) continue; unsigned NewMask = ((1 << Size) - 1) << I; NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; if (!(UsedBanks & NewMask)) FreeBanks |= 1 << I; } return FreeBanks; } // Find free SGPR banks // SGPR tuples must be aligned, so step is size in banks it // crosses. Bank -= SGPR_BANK_OFFSET; for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { if (Bank == I) continue; unsigned NewMask = ((1 << Size) - 1) << I; NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; } return FreeBanks; } unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, unsigned UsedBanks) const { if (!isReassignable(Reg)) return 0; unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) { unsigned Shift = countTrailingZeros(LM); if (Shift >= NUM_VGPR_BANKS) return 0; unsigned VB = FreeBanks & VGPR_BANK_MASK; FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & VGPR_BANK_MASK; } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) { unsigned Shift = countTrailingZeros(LM) >> 1; if (Shift >= NUM_SGPR_BANKS) return 0; unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & SGPR_BANK_SHIFTED_MASK; FreeBanks <<= SGPR_BANK_OFFSET; } LLVM_DEBUG(if (FreeBanks) { dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) << " to banks: "; dumpFreeBanks(FreeBanks); dbgs() << '\n'; }); return FreeBanks; } void GCNRegBankReassign::collectCandidates(MachineInstr& MI, unsigned UsedBanks, unsigned StallCycles) { LLVM_DEBUG(MI.dump()); if (!StallCycles) return; LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { for (unsigned J = I + 1; J != E; ++J) { if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) continue; unsigned Reg1 = OperandMasks[I].Reg; unsigned Reg2 = OperandMasks[J].Reg; unsigned SubReg1 = OperandMasks[I].SubReg; unsigned SubReg2 = OperandMasks[J].SubReg; unsigned Mask1 = OperandMasks[I].Mask; unsigned Mask2 = OperandMasks[J].Mask; unsigned Size1 = countPopulation(Mask1); unsigned Size2 = countPopulation(Mask2); LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << " and " << printReg(Reg2, SubReg2) << '\n'); unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); Weight += MLI->getLoopDepth(MI.getParent()) * 10; LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); if (FreeBanks1) Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight + ((Size2 > Size1) ? 1 : 0))); if (FreeBanks2) Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight + ((Size1 > Size2) ? 1 : 0))); } } } unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, unsigned Reg, int Bank, bool Collect) { unsigned TotalStallCycles = 0; unsigned UsedBanks = 0; SmallSet Visited; for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { if (MI.isBundle()) continue; if (!Visited.insert(&MI).second) continue; unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); TotalStallCycles += StallCycles; if (Collect) collectCandidates(MI, UsedBanks, StallCycles); } return TotalStallCycles; } unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, unsigned Bank) const { const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs : MaxNumSGPRs; unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 : AMDGPU::SGPR0); for (unsigned Reg : RC->getRegisters()) { // Check occupancy limit. if (TRI->isSubRegisterEq(Reg, MaxReg)) break; if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) continue; for (unsigned I = 0; CSRegs[I]; ++I) if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && !LRM->isPhysRegUsed(CSRegs[I])) return AMDGPU::NoRegister; LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); if (!LRM->checkInterference(LI, Reg)) return Reg; } return AMDGPU::NoRegister; } unsigned GCNRegBankReassign::tryReassign(Candidate &C) { if (!LIS->hasInterval(C.Reg)) return 0; LiveInterval &LI = LIS->getInterval(C.Reg); LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); LI.dump()); // For each candidate bank walk all instructions in the range of live // interval and check if replacing the register with one belonging to // the candidate bank reduces conflicts. unsigned OrigStalls = computeStallCycles(C.Reg); LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); if (!OrigStalls) return 0; struct BankStall { BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } unsigned Bank; unsigned Stalls; }; SmallVector BankStalls; for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { if (C.FreeBanks & (1 << Bank)) { LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); if (Stalls < OrigStalls) { LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " << Stalls << '\n'); BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); } } } std::sort(BankStalls.begin(), BankStalls.end()); unsigned OrigReg = VRM->getPhys(C.Reg); LRM->unassign(LI); while (!BankStalls.empty()) { BankStall BS = BankStalls.pop_back_val(); unsigned Reg = scavengeReg(LI, BS.Bank); if (Reg == AMDGPU::NoRegister) { LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) << '\n'); continue; } LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") << " in bank " << printBank(BS.Bank) << '\n'); LRM->assign(LI, Reg); LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); return OrigStalls - BS.Stalls; } LRM->assign(LI, OrigReg); return 0; } unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, bool Collect) { unsigned TotalStallCycles = 0; for (MachineBasicBlock &MBB : MF) { LLVM_DEBUG(if (Collect) { if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); else dbgs() << MBB.getName(); dbgs() << ":\n"; }); for (MachineInstr &MI : MBB.instrs()) { if (MI.isBundle()) continue; // we analyze the instructions inside the bundle individually unsigned UsedBanks = 0; unsigned StallCycles = analyzeInst(MI, UsedBanks); if (Collect) collectCandidates(MI, UsedBanks, StallCycles); TotalStallCycles += StallCycles; } LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); } return TotalStallCycles; } void GCNRegBankReassign::removeCandidates(unsigned Reg) { Candidates.remove_if([Reg, this](const Candidate& C) { return C.MI->readsRegister(Reg, TRI); }); } bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, unsigned OriginalCycles, unsigned CyclesSaved) { unsigned StallCycles = collectCandidates(MF, false); LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles << " stall cycles left\n"); return StallCycles + CyclesSaved == OriginalCycles; } bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) return false; MRI = &MF.getRegInfo(); TRI = ST->getRegisterInfo(); MLI = &getAnalysis(); VRM = &getAnalysis(); LRM = &getAnalysis(); LIS = &getAnalysis(); const SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned Occupancy = MFI->getOccupancy(); MaxNumVGPRs = ST->getMaxNumVGPRs(MF); MaxNumSGPRs = ST->getMaxNumSGPRs(MF); MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); CSRegs = MRI->getCalleeSavedRegs(); RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() << '\n'); unsigned StallCycles = collectCandidates(MF); NumStallsDetected += StallCycles; LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " "function " << MF.getName() << '\n'); Candidates.sort(); LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; for (auto C : Candidates) C.dump(this); dbgs() << "\n\n"); unsigned CyclesSaved = 0; while (!Candidates.empty()) { Candidate C = Candidates.back(); unsigned LocalCyclesSaved = tryReassign(C); CyclesSaved += LocalCyclesSaved; if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) report_fatal_error("RegBank reassign stall cycles verification failed."); Candidates.pop_back(); if (LocalCyclesSaved) { removeCandidates(C.Reg); computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); Candidates.sort(); LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; for (auto C : Candidates) C.dump(this); dbgs() << "\n\n"); } } NumStallsRecovered += CyclesSaved; LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved << " cycles saved in function " << MF.getName() << '\n'); Candidates.clear(); if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) report_fatal_error("RegBank reassign stall cycles verification failed."); RegsUsed.clear(); return CyclesSaved > 0; }