1 //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// This pass inserts branches on the 0 exec mask over divergent branches
12 /// when it's expected that jumping over the untaken control flow will
13 /// be cheaper than having every workitem no-op through it.
15 //===----------------------------------------------------------------------===//
18 #include "AMDGPUSubtarget.h"
19 #include "SIInstrInfo.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/StringRef.h"
24 #include "llvm/CodeGen/MachineBasicBlock.h"
25 #include "llvm/CodeGen/MachineFunction.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineInstr.h"
28 #include "llvm/CodeGen/MachineInstrBuilder.h"
29 #include "llvm/CodeGen/MachineOperand.h"
30 #include "llvm/IR/CallingConv.h"
31 #include "llvm/IR/DebugLoc.h"
32 #include "llvm/MC/MCAsmInfo.h"
33 #include "llvm/Pass.h"
34 #include "llvm/Support/CommandLine.h"
35 #include "llvm/Target/TargetMachine.h"
42 #define DEBUG_TYPE "si-insert-skips"
// Tunable cost threshold: the number of code-emitting instructions a
// divergent region must contain before it is worth branching over it
// (rather than letting every lane predicate through it). Copied into
// SIInsertSkips::SkipThreshold at the start of each run.
44 static cl::opt<unsigned> SkipThresholdFlag(
45 "amdgpu-skip-threshold",
46 cl::desc("Number of instructions before jumping over divergent control flow"),
47 cl::init(12), cl::Hidden);
51 class SIInsertSkips : public MachineFunctionPass {
// Target info pointers, initialized in runOnMachineFunction.
53 const SIRegisterInfo *TRI = nullptr;
54 const SIInstrInfo *TII = nullptr;
// Snapshot of the -amdgpu-skip-threshold flag for this run.
55 unsigned SkipThreshold = 0;
// Returns true when branching from From to To would jump over enough
// real (code-emitting) instructions to reach SkipThreshold, or when the
// skipped region contains instructions that must not execute with
// EXEC == 0 (see shouldSkip's body for the hazard cases).
57 bool shouldSkip(const MachineBasicBlock &From,
58 const MachineBasicBlock &To) const;
// For AMDGPU_PS functions: after a kill, insert an early-exit block
// (null export + S_ENDPGM) reached when the exec mask is zero.
// Returns true if such a block was inserted.
60 bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
// Lower a SI_KILL_*_TERMINATOR pseudo into EXEC-mask-updating code.
62 void kill(MachineInstr &MI);
// Create a new basic block after MBB (made a successor of MBB) to hold
// the early-exit sequence built by skipIfDead.
64 MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
65 MachineBasicBlock::iterator I) const;
// Insert an S_CBRANCH_EXECZ over the region guarded by a
// SI_MASK_BRANCH pseudo. Returns true if a branch was inserted.
67 bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
72 SIInsertSkips() : MachineFunctionPass(ID) {}
74 bool runOnMachineFunction(MachineFunction &MF) override;
76 StringRef getPassName() const override {
77 return "SI insert s_cbranch_execz instructions";
80 void getAnalysisUsage(AnalysisUsage &AU) const override {
81 MachineFunctionPass::getAnalysisUsage(AU);
85 } // end anonymous namespace
// Pass identification: the address of ID uniquely identifies the pass.
87 char SIInsertSkips::ID = 0;
89 INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
90 "SI insert s_cbranch_execz instructions", false, false)
// Exported handle so the target's pass pipeline can refer to this pass.
92 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
// Returns true for bookkeeping pseudo-opcodes that emit no machine code,
// so shouldSkip() does not count them toward the skip threshold.
94 static bool opcodeEmitsNoInsts(unsigned Opc) {
96 case TargetOpcode::IMPLICIT_DEF:
97 case TargetOpcode::KILL:
98 case TargetOpcode::BUNDLE:
99 case TargetOpcode::CFI_INSTRUCTION:
100 case TargetOpcode::EH_LABEL:
101 case TargetOpcode::GC_LABEL:
102 case TargetOpcode::DBG_VALUE:
// Decide whether a skip branch from \p From to \p To pays off: walk the
// blocks in layout order from From up to (but not including) To, counting
// instructions that actually emit code, and report true once the count
// reaches SkipThreshold or a must-skip hazard is found.
109 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
110 const MachineBasicBlock &To) const {
// A block with no successors cannot fall into the region being skipped.
111 if (From.succ_empty())
114 unsigned NumInstr = 0;
115 const MachineFunction *MF = From.getParent();
117 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
118 MBBI != End && MBBI != ToI; ++MBBI) {
119 const MachineBasicBlock &MBB = *MBBI;
121 for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
122 NumInstr < SkipThreshold && I != E; ++I) {
// Pseudos that expand to nothing don't cost any execution time.
123 if (opcodeEmitsNoInsts(I->getOpcode()))
126 // FIXME: Since this is required for correctness, this should be inserted
127 // during SILowerControlFlow.
129 // When a uniform loop is inside non-uniform control flow, the branch
130 // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
131 // when EXEC = 0. We should skip the loop lest it becomes infinite.
132 if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
133 I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
// Instructions with side effects when EXEC is empty (e.g. memory
// hazards) force a skip regardless of the instruction count.
136 if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
140 if (NumInstr >= SkipThreshold)
// If all lanes may be dead after a kill, insert an early-exit path: a
// conditional branch over a new block that does a null export and ends
// the program when EXEC == 0. Only applies to pixel shaders, and only
// when the rest of the function is long enough to be worth skipping.
148 bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
149 MachineBasicBlock &MBB = *MI.getParent();
150 MachineFunction *MF = MBB.getParent();
// Bail out unless this is a pixel shader and shouldSkip says the
// remainder of the function clears the cost threshold.
152 if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
153 !shouldSkip(MBB, MBB.getParent()->back()))
156 MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
158 const DebugLoc &DL = MI.getDebugLoc();
160 // If the exec mask is non-zero, skip the next two instructions
161 BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
164 MachineBasicBlock::iterator Insert = SkipBB->begin();
166 // Exec mask is zero: Export to NULL target...
167 BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
168 .addImm(0x09) // V_008DFC_SQ_EXP_NULL
169 .addReg(AMDGPU::VGPR0, RegState::Undef)
170 .addReg(AMDGPU::VGPR0, RegState::Undef)
171 .addReg(AMDGPU::VGPR0, RegState::Undef)
172 .addReg(AMDGPU::VGPR0, RegState::Undef)
177 // ... and terminate wavefront.
178 BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
// Lower a SI_KILL_*_TERMINATOR pseudo in place (inserted before MI):
// either a floating-point compare-and-mask (V_CMPX_*) or a direct
// EXEC-mask update for the boolean form.
183 void SIInsertSkips::kill(MachineInstr &MI) {
184 MachineBasicBlock &MBB = *MI.getParent();
185 DebugLoc DL = MI.getDebugLoc();
187 switch (MI.getOpcode()) {
// Kill based on "operand(0) <cond> imm"; operand(2) holds the ISD::SET*
// condition code selecting which V_CMPX opcode to emit.
188 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
191 // The opcodes are inverted because the inline immediate has to be
192 // the first operand, e.g. from "x < imm" to "imm > x"
193 switch (MI.getOperand(2).getImm()) {
196 Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
200 Opcode = AMDGPU::V_CMPX_LT_F32_e64;
204 Opcode = AMDGPU::V_CMPX_LE_F32_e64;
208 Opcode = AMDGPU::V_CMPX_GT_F32_e64;
212 Opcode = AMDGPU::V_CMPX_GE_F32_e64;
216 Opcode = AMDGPU::V_CMPX_LG_F32_e64;
219 Opcode = AMDGPU::V_CMPX_O_F32_e64;
222 Opcode = AMDGPU::V_CMPX_U_F32_e64;
225 Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
228 Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
231 Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
234 Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
237 Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
240 Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
243 llvm_unreachable("invalid ISD:SET cond code");
246 assert(MI.getOperand(0).isReg());
// When the compared value lives in a VGPR, the compact VOP32 encoding
// can be used; otherwise emit the e64 form with explicit modifiers.
248 if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
249 MI.getOperand(0).getReg())) {
250 Opcode = AMDGPU::getVOPe32(Opcode);
251 BuildMI(MBB, &MI, DL, TII->get(Opcode))
252 .add(MI.getOperand(1))
253 .add(MI.getOperand(0));
255 BuildMI(MBB, &MI, DL, TII->get(Opcode))
256 .addReg(AMDGPU::VCC, RegState::Define)
257 .addImm(0) // src0 modifiers
258 .add(MI.getOperand(1))
259 .addImm(0) // src1 modifiers
260 .add(MI.getOperand(0))
// Kill based on a boolean (i1) operand; operand(1) selects whether a
// true (-1) or false (0) value kills the lane.
265 case AMDGPU::SI_KILL_I1_TERMINATOR: {
266 const MachineOperand &Op = MI.getOperand(0);
267 int64_t KillVal = MI.getOperand(1).getImm();
268 assert(KillVal == 0 || KillVal == -1);
270 // Kill all threads if Op0 is an immediate and equal to the Kill value.
272 int64_t Imm = Op.getImm();
273 assert(Imm == 0 || Imm == -1);
// Immediate matched the kill value: clear EXEC entirely.
276 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
// Otherwise mask EXEC with the condition: ANDN2 to kill lanes where
// the value is true, AND to kill lanes where it is false.
281 unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
282 BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
283 .addReg(AMDGPU::EXEC)
288 llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
// Create a fresh, empty basic block, add it to the function's block
// list near MBB, and make it a CFG successor of MBB. The caller
// (skipIfDead) fills it with the early-exit sequence.
292 MachineBasicBlock *SIInsertSkips::insertSkipBlock(
293 MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
294 MachineFunction *MF = MBB.getParent();
296 MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
297 MachineFunction::iterator MBBI(MBB);
300 MF->insert(MBBI, SkipBB);
301 MBB.addSuccessor(SkipBB);
306 // Returns true if a branch over the block was inserted.
// \p MI is the SI_MASK_BRANCH pseudo whose operand 0 names the join
// block; the S_CBRANCH_EXECZ is placed right after it in \p SrcMBB.
307 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
308 MachineBasicBlock &SrcMBB) {
309 MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
// Skip only when the guarded region clears the cost threshold.
311 if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
314 const DebugLoc &DL = MI.getDebugLoc();
315 MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
317 BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
// Pass entry point: walk the function's blocks in layout order, lowering
// SI_MASK_BRANCH / SI_KILL_* / SI_RETURN_TO_EPILOG pseudos and inserting
// exec-zero skip branches where profitable. Returns true on change.
323 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
324 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
325 TII = ST.getInstrInfo();
326 TRI = &TII->getRegisterInfo();
327 SkipThreshold = SkipThresholdFlag;
329 bool HaveKill = false;
330 bool MadeChange = false;
332 // Track depth of exec mask, divergent branches.
333 SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
// NextBB is recomputed each iteration because skipIfDead may insert a
// new block after the current one.
335 MachineFunction::iterator NextBB;
// Lazily-created trailing block used as a jump target for
// SI_RETURN_TO_EPILOG instructions that are not last in the function.
337 MachineBasicBlock *EmptyMBBAtEnd = nullptr;
339 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
340 BI != BE; BI = NextBB) {
341 NextBB = std::next(BI);
342 MachineBasicBlock &MBB = *BI;
343 bool HaveSkipBlock = false;
345 if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
346 // Reached convergence point for last divergent branch.
347 ExecBranchStack.pop_back();
350 if (HaveKill && ExecBranchStack.empty()) {
353 // TODO: Insert skip if exec is 0?
// Next is captured before processing MI since lowering may erase MI.
356 MachineBasicBlock::iterator I, Next;
357 for (I = MBB.begin(); I != MBB.end(); I = Next) {
360 MachineInstr &MI = *I;
362 switch (MI.getOpcode()) {
// Divergent-branch pseudo: remember its join block and try to insert
// an S_CBRANCH_EXECZ over the guarded region.
363 case AMDGPU::SI_MASK_BRANCH:
364 ExecBranchStack.push_back(MI.getOperand(0).getMBB());
365 MadeChange |= skipMaskBranch(MI, MBB);
368 case AMDGPU::S_BRANCH:
369 // Optimize out branches to the next block.
370 // FIXME: Shouldn't this be handled by BranchFolding?
371 if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
372 MI.eraseFromParent();
373 } else if (HaveSkipBlock) {
374 // Remove the given unconditional branch when a skip block has been
375 // inserted after the current one and let skip the two instructions
376 // performing the kill if the exec mask is non-zero.
377 MI.eraseFromParent();
// Kill pseudos: lower to EXEC-mask updates, then (outside divergent
// control flow) consider inserting the all-lanes-dead early exit.
381 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
382 case AMDGPU::SI_KILL_I1_TERMINATOR:
386 if (ExecBranchStack.empty()) {
387 if (skipIfDead(MI, *NextBB)) {
388 HaveSkipBlock = true;
// Re-resolve NextBB: skipIfDead added a block after this one.
389 NextBB = std::next(BI);
396 MI.eraseFromParent();
399 case AMDGPU::SI_RETURN_TO_EPILOG:
400 // FIXME: Should move somewhere else
401 assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
403 // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
404 // because external bytecode will be appended at the end.
405 if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
406 // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
407 // the end and jump there.
408 if (!EmptyMBBAtEnd) {
409 EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
410 MF.insert(MF.end(), EmptyMBBAtEnd);
413 MBB.addSuccessor(EmptyMBBAtEnd);
414 BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
415 .addMBB(EmptyMBBAtEnd);
416 I->eraseFromParent();