1 //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass inserts branches on the 0 exec mask over divergent branches
11 /// when it's expected that jumping over the untaken control flow will
12 /// be cheaper than having every workitem no-op through it.
14 //===----------------------------------------------------------------------===//
17 #include "AMDGPUSubtarget.h"
18 #include "SIInstrInfo.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "llvm/ADT/DepthFirstIterator.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/StringRef.h"
24 #include "llvm/CodeGen/MachineBasicBlock.h"
25 #include "llvm/CodeGen/MachineDominators.h"
26 #include "llvm/CodeGen/MachineFunction.h"
27 #include "llvm/CodeGen/MachineFunctionPass.h"
28 #include "llvm/CodeGen/MachineInstr.h"
29 #include "llvm/CodeGen/MachineInstrBuilder.h"
30 #include "llvm/CodeGen/MachineOperand.h"
31 #include "llvm/IR/CallingConv.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/InitializePasses.h"
34 #include "llvm/MC/MCAsmInfo.h"
35 #include "llvm/Pass.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Target/TargetMachine.h"
44 #define DEBUG_TYPE "si-insert-skips"
// Command-line knob: how many instructions a divergent region may contain
// before this pass considers it worth inserting a branch to skip it.
// Read once per function into SIInsertSkips::SkipThreshold.
46 static cl::opt<unsigned> SkipThresholdFlag(
47 "amdgpu-skip-threshold-legacy",
48 cl::desc("Number of instructions before jumping over divergent control flow"),
49 cl::init(12), cl::Hidden);
53 class SIInsertSkips : public MachineFunctionPass {
// Cached target info; (re)initialized in runOnMachineFunction.
55 const SIRegisterInfo *TRI = nullptr;
56 const SIInstrInfo *TII = nullptr;
// Instruction-count budget taken from the SkipThresholdFlag cl::opt.
57 unsigned SkipThreshold = 0;
58 MachineDominatorTree *MDT = nullptr;
// Lazily created block containing "null export; s_endpgm"; shared by all
// early-exit branches inserted in the current function.
60 MachineBasicBlock *EarlyExitBlock = nullptr;
// Returns true if the code between From and To is expensive enough that
// branching over it when EXEC == 0 beats no-op'ing through it.
62 bool shouldSkip(const MachineBasicBlock &From,
63 const MachineBasicBlock &To) const;
// True if MBB dominates every block reachable from it; used to decide
// whether an early-exit can be inserted safely after a kill.
65 bool dominatesAllReachable(MachineBasicBlock &MBB);
66 void createEarlyExitBlock(MachineBasicBlock &MBB);
67 void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Lower a SI_KILL_*_TERMINATOR pseudo into EXEC-manipulating instructions.
70 bool kill(MachineInstr &MI);
// Insert an S_CBRANCH_EXECZ over the region guarded by a SI_MASK_BRANCH.
72 bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
77 SIInsertSkips() : MachineFunctionPass(ID) {}
79 bool runOnMachineFunction(MachineFunction &MF) override;
81 StringRef getPassName() const override {
82 return "SI insert s_cbranch_execz instructions";
// The dominator tree is both required and preserved: block splits made by
// skipIfDead update it in place rather than invalidating it.
85 void getAnalysisUsage(AnalysisUsage &AU) const override {
86 AU.addRequired<MachineDominatorTree>();
87 AU.addPreserved<MachineDominatorTree>();
88 MachineFunctionPass::getAnalysisUsage(AU);
92 } // end anonymous namespace
// Static pass identity; its address is what the pass manager keys on.
94 char SIInsertSkips::ID = 0;
// Register the pass and declare its dependency on MachineDominatorTree.
96 INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
97 "SI insert s_cbranch_execz instructions", false, false)
98 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
99 INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
100 "SI insert s_cbranch_execz instructions", false, false)
// Exported handle used elsewhere (e.g. the target pass pipeline) to refer
// to this pass by identity.
102 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
// Returns true for instructions that produce no machine code, so they cost
// nothing to "execute" and should not count toward the skip threshold.
104 static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
// Generic meta instructions (debug info, CFI, etc.) emit nothing.
105 if (MI.isMetaInstruction())
108 // Handle target specific opcodes.
109 switch (MI.getOpcode()) {
// SI_MASK_BRANCH is a pseudo that is lowered away by this pass.
110 case AMDGPU::SI_MASK_BRANCH:
// Decide whether to insert a skip branch over the blocks in layout order
// from \p From up to (but not including) \p To: returns true when the span
// contains enough real instructions (>= SkipThreshold), or any instruction
// that is incorrect or expensive to execute with EXEC == 0.
117 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
118 const MachineBasicBlock &To) const {
119 unsigned NumInstr = 0;
120 const MachineFunction *MF = From.getParent();
// Walk blocks in function layout order; stop at To or the function end.
122 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
123 MBBI != End && MBBI != ToI; ++MBBI) {
124 const MachineBasicBlock &MBB = *MBBI;
126 for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
127 NumInstr < SkipThreshold && I != E; ++I) {
// Zero-size instructions are free; don't count them.
128 if (opcodeEmitsNoInsts(*I))
131 // FIXME: Since this is required for correctness, this should be inserted
132 // during SILowerControlFlow.
134 // When a uniform loop is inside non-uniform control flow, the branch
135 // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
136 // when EXEC = 0. We should skip the loop lest it becomes infinite.
137 if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
138 I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
// Instructions with side effects under EXEC == 0 must be skipped.
141 if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
144 // These instructions are potentially expensive even if EXEC = 0.
145 if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
146 I->getOpcode() == AMDGPU::S_WAITCNT)
// Threshold reached: the region is big enough to be worth skipping.
150 if (NumInstr >= SkipThreshold)
158 /// Check whether \p MBB dominates all blocks that are reachable from it.
// Walks the CFG depth-first from MBB and queries the dominator tree for
// each visited block; any non-dominated reachable block disqualifies MBB.
159 bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
160 for (MachineBasicBlock *Other : depth_first(&MBB)) {
161 if (!MDT->dominates(&MBB, Other))
// Emit the pixel-shader termination sequence at \p I: a "done" null export
// (required so the hardware knows the shader produced no outputs) followed
// by s_endpgm.
167 static void generatePsEndPgm(MachineBasicBlock &MBB,
168 MachineBasicBlock::iterator I, DebugLoc DL,
169 const SIInstrInfo *TII) {
170 // Generate "null export; s_endpgm".
171 BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
172 .addImm(0x09) // V_008DFC_SQ_EXP_NULL
// The exported VGPR values are irrelevant for a null export, so reuse
// VGPR0 marked undef for all four components.
173 .addReg(AMDGPU::VGPR0, RegState::Undef)
174 .addReg(AMDGPU::VGPR0, RegState::Undef)
175 .addReg(AMDGPU::VGPR0, RegState::Undef)
176 .addReg(AMDGPU::VGPR0, RegState::Undef)
180 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
// Create the shared early-exit block at the end of the function, containing
// the "null export; s_endpgm" sequence. Called at most once per function
// (guarded by the assert); skipIfDead branches here when EXEC == 0.
183 void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
184 MachineFunction *MF = MBB.getParent();
187 assert(!EarlyExitBlock);
188 EarlyExitBlock = MF->CreateMachineBasicBlock();
189 MF->insert(MF->end(), EarlyExitBlock);
191 generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
194 /// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
195 /// iterator. Only applies to pixel shaders.
196 void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
197 MachineBasicBlock::iterator I, DebugLoc DL) {
198 MachineFunction *MF = MBB.getParent();
// Early exit via null export is only meaningful for pixel shaders.
199 assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
201 // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
202 // basic block that has no further successors (e.g., there was an
203 // `unreachable` there in IR). This can happen with original source of the
206 // if (uniform_condition) {
207 //   write_to_memory();
211 // In this case, we write the "null_export; s_endpgm" skip code in the
212 // already-existing basic block.
213 auto NextBBI = std::next(MBB.getIterator());
// "No successor": I is at the block end and the layout-next block is not
// a CFG successor, so the end-of-program sequence can be emitted inline.
214 bool NoSuccessor = I == MBB.end() &&
215 llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
218 generatePsEndPgm(MBB, I, DL, TII);
// Otherwise branch to the shared early-exit block, creating it on demand.
220 if (!EarlyExitBlock) {
221 createEarlyExitBlock(MBB);
222 // Update next block pointer to reflect any new blocks
223 NextBBI = std::next(MBB.getIterator());
226 auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
227 .addMBB(EarlyExitBlock);
229 // Split the block if the branch will not come at the end.
230 auto Next = std::next(BranchMI->getIterator());
231 if (Next != MBB.end() && !Next->isTerminator()) {
232 MachineBasicBlock *SplitBB =
233 MF->CreateMachineBasicBlock(MBB.getBasicBlock());
234 MF->insert(NextBBI, SplitBB);
// Move the remainder of MBB (from I onward) into the new block and hand
// over MBB's successors/PHI references to it.
235 SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end());
236 SplitBB->transferSuccessorsAndUpdatePHIs(&MBB);
237 // FIXME: the expectation is that this will be used near the beginning
238 // of a block so just assume all registers are still live.
239 for (auto LiveIn : MBB.liveins())
240 SplitBB->addLiveIn(LiveIn);
241 MBB.addSuccessor(SplitBB);
243 // Update dominator tree
244 using DomTreeT = DomTreeBase<MachineBasicBlock>;
245 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
// Every edge that used to leave MBB now leaves SplitBB instead.
246 for (MachineBasicBlock *Succ : SplitBB->successors()) {
247 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
248 DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
250 DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
251 MDT->getBase().applyUpdates(DTUpdates);
// Record the new conditional edge into the early-exit block.
254 MBB.addSuccessor(EarlyExitBlock);
255 MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
259 /// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
260 /// Return true unless the terminator is a no-op.
261 bool SIInsertSkips::kill(MachineInstr &MI) {
262 MachineBasicBlock &MBB = *MI.getParent();
263 DebugLoc DL = MI.getDebugLoc();
265 switch (MI.getOpcode()) {
// Float-compare kill: lowered to a V_CMPX_* that writes EXEC directly.
266 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
269 // The opcodes are inverted because the inline immediate has to be
270 // the first operand, e.g. from "x < imm" to "imm > x"
// Operand 2 holds the ISD::CondCode selecting which V_CMPX_* to emit.
271 switch (MI.getOperand(2).getImm()) {
274 Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
278 Opcode = AMDGPU::V_CMPX_LT_F32_e64;
282 Opcode = AMDGPU::V_CMPX_LE_F32_e64;
286 Opcode = AMDGPU::V_CMPX_GT_F32_e64;
290 Opcode = AMDGPU::V_CMPX_GE_F32_e64;
294 Opcode = AMDGPU::V_CMPX_LG_F32_e64;
297 Opcode = AMDGPU::V_CMPX_O_F32_e64;
300 Opcode = AMDGPU::V_CMPX_U_F32_e64;
303 Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
306 Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
309 Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
312 Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
315 Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
318 Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
321 llvm_unreachable("invalid ISD:SET cond code");
// Subtargets without an SGPR destination on V_CMPX use dedicated
// no-sdst opcode variants.
324 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
325 if (ST.hasNoSdstCMPX())
326 Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
328 assert(MI.getOperand(0).isReg());
// If the compared value is in a VGPR, the shorter 32-bit encoding works;
// otherwise fall through to the e64 form with explicit modifiers below.
330 if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
331 MI.getOperand(0).getReg())) {
332 Opcode = AMDGPU::getVOPe32(Opcode);
333 BuildMI(MBB, &MI, DL, TII->get(Opcode))
334 .add(MI.getOperand(1))
335 .add(MI.getOperand(0));
337 auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
// Only targets that still have an SDST write VCC here.
338 if (!ST.hasNoSdstCMPX())
339 I.addReg(AMDGPU::VCC, RegState::Define);
341 I.addImm(0)  // src0 modifiers
342 .add(MI.getOperand(1))
343 .addImm(0)  // src1 modifiers
344 .add(MI.getOperand(0));
// Boolean kill: AND (or ANDN2) the i1 mask into EXEC.
350 case AMDGPU::SI_KILL_I1_TERMINATOR: {
351 const MachineFunction *MF = MI.getParent()->getParent();
352 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
// Wave32 subtargets use the 32-bit EXEC_LO register and *_B32 opcodes.
353 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
354 const MachineOperand &Op = MI.getOperand(0);
// Operand 1 selects the polarity: kill lanes where Op == KillVal.
355 int64_t KillVal = MI.getOperand(1).getImm();
356 assert(KillVal == 0 || KillVal == -1);
358 // Kill all threads if Op0 is an immediate and equal to the Kill value.
360 int64_t Imm = Op.getImm();
361 assert(Imm == 0 || Imm == -1);
// Constant mask matching the kill polarity: clear EXEC entirely.
363 if (Imm == KillVal) {
364 BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
365 : AMDGPU::S_MOV_B64), Exec)
// Non-constant mask: ANDN2 removes lanes where the mask is set (KillVal
// != 0), plain AND keeps only lanes where it is set (KillVal == 0).
372 unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
374 Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
375 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
381 llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
385 // Returns true if a branch over the block was inserted.
// Lowers a SI_MASK_BRANCH pseudo: if the guarded region (from SrcMBB's
// first successor to the mask branch's destination) is worth skipping,
// insert an S_CBRANCH_EXECZ immediately after the pseudo.
386 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
387 MachineBasicBlock &SrcMBB) {
// Operand 0 of SI_MASK_BRANCH is the block the skip would jump to.
388 MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
// Cheap regions are left alone; no-op'ing through them is faster than
// the branch.
390 if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
393 const DebugLoc &DL = MI.getDebugLoc();
394 MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
396 BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
402 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
403 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
404 TII = ST.getInstrInfo();
405 TRI = &TII->getRegisterInfo();
406 MDT = &getAnalysis<MachineDominatorTree>();
407 SkipThreshold = SkipThresholdFlag;
409 SmallVector<MachineInstr *, 4> KillInstrs;
410 bool MadeChange = false;
412 for (MachineBasicBlock &MBB : MF) {
413 MachineBasicBlock::iterator I, Next;
414 for (I = MBB.begin(); I != MBB.end(); I = Next) {
416 MachineInstr &MI = *I;
418 switch (MI.getOpcode()) {
419 case AMDGPU::SI_MASK_BRANCH:
420 MadeChange |= skipMaskBranch(MI, MBB);
423 case AMDGPU::S_BRANCH:
424 // Optimize out branches to the next block.
425 // FIXME: Shouldn't this be handled by BranchFolding?
426 if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
427 assert(&MI == &MBB.back());
428 MI.eraseFromParent();
433 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
434 case AMDGPU::SI_KILL_I1_TERMINATOR: {
436 bool CanKill = kill(MI);
438 // Check if we can add an early "if exec=0 { end shader }".
440 // Note that we _always_ do this if it is correct, even if the kill
441 // happens fairly late in the shader, because the null export should
442 // generally still be cheaper than normal export(s).
444 // TODO: The dominatesAllReachable check is conservative: if the
445 // dominance is only missing due to _uniform_ branches, we could
446 // in fact insert the early-exit as well.
448 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
449 dominatesAllReachable(MBB)) {
450 // Mark the instruction for kill-if-dead insertion. We delay this
451 // change because it modifies the CFG.
452 KillInstrs.push_back(&MI);
454 MI.eraseFromParent();
459 case AMDGPU::SI_KILL_CLEANUP:
460 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
461 dominatesAllReachable(MBB)) {
462 KillInstrs.push_back(&MI);
464 MI.eraseFromParent();
474 for (MachineInstr *Kill : KillInstrs) {
475 skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
476 Kill->getDebugLoc());
477 Kill->eraseFromParent();
480 EarlyExitBlock = nullptr;