1 //===-- SIPreEmitPeephole.cpp ------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass performs the peephole optimizations before code emission.
12 //===----------------------------------------------------------------------===//
15 #include "AMDGPUSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIInstrInfo.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "llvm/CodeGen/MachineFunctionPass.h"
20 #include "llvm/Support/CommandLine.h"
24 #define DEBUG_TYPE "si-pre-emit-peephole"
// Machine function pass run just before code emission; performs small
// peephole rewrites that are only legal/profitable this late (after block
// placement). NOTE(review): the enclosing anonymous-namespace opener and the
// `static char ID` member are elided from this view.
28 class SIPreEmitPeephole : public MachineFunctionPass {
// Cached target hooks, initialized at the top of runOnMachineFunction.
30 const SIInstrInfo *TII = nullptr;
31 const SIRegisterInfo *TRI = nullptr;
// Try to simplify an S_CBRANCH_VCCZ/VCCNZ whose VCC is produced by an
// S_AND/S_ANDN2 with EXEC; returns true if the branch was rewritten/erased.
33 bool optimizeVccBranch(MachineInstr &MI) const;
// Erase a second S_SET_GPR_IDX_ON (MI) that is identical to an earlier one
// (First) when nothing in between invalidates the index mode; returns true
// on success.
34 bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
// Registers this pass with the PassRegistry on construction.
39 SIPreEmitPeephole() : MachineFunctionPass(ID) {
40 initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
43 bool runOnMachineFunction(MachineFunction &MF) override;
46 } // End anonymous namespace.
// Register the pass under DEBUG_TYPE ("si-pre-emit-peephole"); the two
// `false` flags mark it as neither CFG-only nor an analysis pass.
48 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
49 "SI peephole optimizations", false, false)
// Out-of-line definition of the pass's identity anchor.
51 char SIPreEmitPeephole::ID = 0;
// Public handle other AMDGPU code uses to schedule/reference this pass.
53 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
// Simplify a VCC-conditional branch (MI is S_CBRANCH_VCCZ/VCCNZ — see the
// opcode switch in runOnMachineFunction) when the instruction that defines
// VCC is `S_AND exec, sreg` / `S_ANDN2 exec, sreg` and the mask value is a
// known constant (-1 or 0). Depending on the mask, the branch becomes an
// unconditional S_BRANCH, an EXEC-based branch, or is deleted outright.
55 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
58 // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
62 // We end up with this pattern sometimes after basic block placement.
63 // It happens while combining a block which assigns -1 or 0 to a saved mask
64 // and another block which consumes that saved mask and then a branch.
66 MachineBasicBlock &MBB = *MI.getParent();
67 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
// Wave32 targets use the 32-bit exec/AND opcodes; wave64 the 64-bit forms.
68 const bool IsWave32 = ST.isWave32();
69 const unsigned CondReg = TRI->getVCC();
70 const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
71 const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
72 const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
// Walk backwards from the branch to find the S_AND/S_ANDN2 that defines
// VCC, giving up after Threshold instructions or if EXEC is clobbered.
// NOTE(review): the reverse-end iterator `E` and several early-exit lines
// are elided from this view.
74 MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
76 bool ReadsCond = false;
// Bound the backward scan for compile time.
77 unsigned Threshold = 5;
78 for (++A; A != E; ++A) {
81 if (A->modifiesRegister(ExecReg, TRI))
83 if (A->modifiesRegister(CondReg, TRI)) {
// Only a clean full definition of VCC by the expected AND/ANDN2 qualifies.
84 if (!A->definesRegister(CondReg, TRI) ||
85 (A->getOpcode() != And && A->getOpcode() != AndN2))
// Track whether any scanned instruction also reads VCC; if so the AND must
// be kept alive below.
89 ReadsCond |= A->readsRegister(CondReg, TRI);
// A now points at the AND/ANDN2. Canonicalize so Op1 is EXEC.
94 MachineOperand &Op1 = A->getOperand(1);
95 MachineOperand &Op2 = A->getOperand(2);
96 if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
97 TII->commuteInstruction(*A);
100 if (Op1.getReg() != ExecReg)
// Only all-ones / all-zeros immediate masks are foldable.
102 if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
105 int64_t MaskValue = 0;
// If Op2 is a register (SReg — declaration elided from this view), scan
// forward for the move-immediate that materializes its value (-1 or 0).
109 auto M = std::next(A);
110 bool ReadsSreg = false;
111 for (; M != E; ++M) {
112 if (M->definesRegister(SReg, TRI))
114 if (M->modifiesRegister(SReg, TRI))
116 ReadsSreg |= M->readsRegister(SReg, TRI);
118 if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
119 (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
121 MaskValue = M->getOperand(1).getImm();
122 // First if sreg is only used in the AND instruction fold the immediate
123 // into the AND.
124 if (!ReadsSreg && Op2.isKill()) {
125 A->getOperand(2).ChangeToImmediate(MaskValue);
126 M->eraseFromParent();
128 } else if (Op2.isImm()) {
129 MaskValue = Op2.getImm();
131 llvm_unreachable("Op2 must be register or immediate");
134 // Invert mask for s_andn2
135 assert(MaskValue == 0 || MaskValue == -1);
136 if (A->getOpcode() == AndN2)
137 MaskValue = ~MaskValue;
// The AND itself is dead if nothing else reads VCC, SCC is dead, and the
// branch kills VCC.
139 if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
140 MI.killsRegister(CondReg, TRI))
141 A->eraseFromParent();
// Rewrite the branch based on the now-known mask value.
143 bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
144 if (SReg == ExecReg) {
145 // EXEC is updated directly
// NOTE(review): one arm of this case (VCCZ ⇒ erase vs. rewrite to S_BRANCH)
// is elided between these two lines.
147 MI.eraseFromParent();
150 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
151 } else if (IsVCCZ && MaskValue == 0) {
152 // Will always branch
153 // Remove all successors shadowed by new unconditional branch
154 MachineBasicBlock *Parent = MI.getParent();
155 SmallVector<MachineInstr *, 4> ToRemove;
// Collect the terminators after MI; they become unreachable.
// (`Found` is declared in an elided line.)
157 for (MachineInstr &Term : Parent->terminators()) {
160 ToRemove.push_back(&Term);
162 Found = Term.isIdenticalTo(MI);
165 assert(Found && "conditional branch is not terminator");
166 for (auto BranchMI : ToRemove) {
167 MachineOperand &Dst = BranchMI->getOperand(0);
168 assert(Dst.isMBB() && "destination is not basic block");
169 Parent->removeSuccessor(Dst.getMBB());
170 BranchMI->eraseFromParent();
// The fall-through successor is also dead once the branch is unconditional.
173 if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
174 Parent->removeSuccessor(Succ);
177 // Rewrite to unconditional branch
178 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
179 } else if (!IsVCCZ && MaskValue == 0) {
// Branch can never be taken: drop the successor edge and the branch.
181 MachineOperand &Dst = MI.getOperand(0);
182 assert(Dst.isMBB() && "destination is not basic block");
183 MI.getParent()->removeSuccessor(Dst.getMBB());
184 MI.eraseFromParent();
186 } else if (MaskValue == -1) {
187 // Depends only on EXEC
// Condition reduces to exec itself: switch to the EXEC-based branch.
189 TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
// Drop the now-unused VCC use operand and rebuild implicit operands for the
// new opcode.
192 MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
193 MI.addImplicitDefUseOperands(*MBB.getParent());
// Remove a redundant S_SET_GPR_IDX_ON: if MI is byte-identical to an earlier
// S_SET_GPR_IDX_ON (First) and nothing between them changes M0, the index
// register, or touches vector registers outside the allowed indirect-move
// forms, then MI (and the intervening S_SET_GPR_IDX_OFF instructions) can be
// erased. Returns true if anything was deleted.
// NOTE(review): several `return false`/`break` lines inside the scan are
// elided from this view.
198 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
199 MachineInstr &MI) const {
200 MachineBasicBlock &MBB = *MI.getParent();
201 const MachineFunction &MF = *MBB.getParent();
202 const MachineRegisterInfo &MRI = MF.getRegInfo();
// src0 of S_SET_GPR_IDX_ON is the index; it may be a register or immediate.
203 MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
204 Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
// Paired IDX_OFF instructions found in between, deleted on success.
205 SmallVector<MachineInstr *, 4> ToRemove;
// The two SET_GPR_IDX_ON must request exactly the same index and mode.
208 if (!MI.isIdenticalTo(First))
211 // Scan back to find an identical S_SET_GPR_IDX_ON
212 for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
213 E = MI.getIterator(); I != E; ++I) {
214 switch (I->getOpcode()) {
// An explicit mode change invalidates the earlier SET_GPR_IDX_ON.
215 case AMDGPU::S_SET_GPR_IDX_MODE:
// Matching OFF instructions become dead once MI is removed.
217 case AMDGPU::S_SET_GPR_IDX_OFF:
219 ToRemove.push_back(&*I);
// M0 holds the GPR index while the mode is on; any clobber kills the fold.
222 if (I->modifiesRegister(AMDGPU::M0, TRI))
// Likewise a redefinition of the index register itself.
224 if (IdxReg && I->modifiesRegister(IdxReg, TRI))
// Any use of a vector register in between is unsafe...
226 if (llvm::any_of(I->operands(),
227 [&MRI, this](const MachineOperand &MO) {
229 TRI->isVectorRegister(MRI, MO.getReg());
231 // The only exception allowed here is another indirect vector move
232 // with the same mode.
234 !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
235 I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
236 I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
// Safe: drop the duplicate ON and its collected OFF instructions.
242 MI.eraseFromParent();
243 for (MachineInstr *RI : ToRemove)
244 RI->eraseFromParent();
// Pass driver: per block, (1) run optimizeVccBranch on a VCC-conditional
// terminator, (2) fix up SI_RETURN_TO_EPILOG placement, and (3) on targets
// with VGPR index mode, look for redundant S_SET_GPR_IDX_ON pairs.
// NOTE(review): this definition is truncated in this view (body continues
// past the last visible line; `break;` statements and the `Count`/`SetGPRMI`
// bookkeeping inside the scan loop are elided).
248 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
249 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// Cache target hooks used by the helper routines.
250 TII = ST.getInstrInfo();
251 TRI = &TII->getRegisterInfo();
// Lazily-created trailing block used as a landing pad for non-final
// SI_RETURN_TO_EPILOG instructions (shared across all blocks).
252 MachineBasicBlock *EmptyMBBAtEnd = nullptr;
253 bool Changed = false;
255 for (MachineBasicBlock &MBB : MF) {
256 MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
257 if (MBBE != MBB.end()) {
258 MachineInstr &MI = *MBBE;
259 switch (MI.getOpcode()) {
260 case AMDGPU::S_CBRANCH_VCCZ:
261 case AMDGPU::S_CBRANCH_VCCNZ:
262 Changed |= optimizeVccBranch(MI);
264 case AMDGPU::SI_RETURN_TO_EPILOG:
265 // FIXME: This is not an optimization and should be
266 // moved somewhere else.
267 assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
269 // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
270 // because external bytecode will be appended at the end.
271 if (&MBB != &MF.back() || &MI != &MBB.back()) {
272 // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
273 // at the end and jump there.
274 if (!EmptyMBBAtEnd) {
275 EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
276 MF.insert(MF.end(), EmptyMBBAtEnd);
// Replace the misplaced return with a jump to the trailing empty block.
279 MBB.addSuccessor(EmptyMBBAtEnd);
280 BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
281 .addMBB(EmptyMBBAtEnd);
282 MI.eraseFromParent();
// Terminator changed; refresh the iterator used by the scan below.
283 MBBE = MBB.getFirstTerminator();
// S_SET_GPR_IDX_ON only exists on subtargets with VGPR index mode.
291 if (!ST.hasVGPRIndexMode())
294 MachineInstr *SetGPRMI = nullptr;
295 const unsigned Threshold = 20;
297 // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
298 // second is not needed. Do expensive checks in the optimizeSetGPR()
299 // and limit the distance to 20 instructions for compile time purposes.
300 for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
301 MachineInstr &MI = *MBBI;
// Give up on this candidate pair once the distance limit is reached.
304 if (Count == Threshold)
309 if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
318 if (optimizeSetGPR(*SetGPRMI, MI))