1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief This pass lowers the pseudo control flow instructions to real
12 /// machine instructions.
14 /// All control flow is handled using predicated instructions and
15 /// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
16 /// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
18 /// single vector ALU). Typically, for predicates, a vector ALU will write
19 /// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
20 /// Vector ALU) and then the ScalarALU will AND the VCC register with the
21 /// EXEC to update the predicates.
24 /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
25 /// %SGPR0 = SI_IF %VCC
26 /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
27 /// %SGPR0 = SI_ELSE %SGPR0
28 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
33 /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
34 /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
35 /// S_CBRANCH_EXECZ label0 // This instruction is an optional
36 /// // optimization which allows us to
37 /// // branch if all the bits of
39 /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
42 /// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block
43 /// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
44 /// S_BRANCH_EXECZ label1 // Use our branch optimization
45 /// // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the THEN block
48 /// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
49 //===----------------------------------------------------------------------===//
52 #include "SIInstrInfo.h"
53 #include "SIMachineFunctionInfo.h"
54 #include "llvm/CodeGen/MachineFunction.h"
55 #include "llvm/CodeGen/MachineFunctionPass.h"
56 #include "llvm/CodeGen/MachineInstrBuilder.h"
57 #include "llvm/CodeGen/MachineRegisterInfo.h"
// Pass that replaces the SI_* pseudo control-flow instructions with real
// EXEC-mask arithmetic (S_AND_SAVEEXEC_B64, S_XOR_B64, S_CBRANCH_EXECZ, ...)
// as described in the file header comment.
// NOTE(review): the anonymous-namespace opener, access specifiers and the
// 'static char ID;' member are not visible in this chunk of the file.
class SILowerControlFlowPass : public MachineFunctionPass {

  // A skip branch is only emitted when the region being jumped over holds at
  // least this many instructions; shorter regions are cheaper to run with all
  // lanes predicated off than to branch around.
  static const unsigned SkipThreshold = 12;

  const TargetRegisterInfo *TRI;  // Filled in by runOnMachineFunction.
  const TargetInstrInfo *TII;     // Filled in by runOnMachineFunction.

  // True when the code between From and To reaches SkipThreshold
  // instructions, i.e. a skip branch is worthwhile.
  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  // Emit an S_CBRANCH_EXECZ before From jumping to To (when worthwhile).
  void Skip(MachineInstr &From, MachineOperand &To);
  // Pixel shaders only: if EXEC is zero, export NULL and end the wavefront.
  void SkipIfDead(MachineInstr &MI);

  // Lowering helpers, one per pseudo control-flow instruction.
  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  // Helpers for indirect register addressing through the M0 register.
  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

  // TRI/TII start out null; they are set from the target when the pass runs.
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TRI(0), TII(0) { }

  virtual bool runOnMachineFunction(MachineFunction &MF);

  const char *getPassName() const {
    return "SI Lower control flow instructions";

} // End anonymous namespace
// Pass identification: the address of ID uniquely identifies the pass.
char SILowerControlFlowPass::ID = 0;

// Factory used by the AMDGPU target when building the codegen pipeline.
// NOTE(review): the function's closing brace is not visible in this chunk.
FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  return new SILowerControlFlowPass(tm);
// Returns true if Opcode is one of the local-data-share (LDS) instructions;
// runOnMachineFunction uses this to decide whether M0 must be initialized at
// function entry before any LDS access.
// NOTE(review): the 'switch (Opcode) {' opener and the 'return true' arm for
// the listed cases are not visible in this chunk of the file.
static bool isDS(unsigned Opcode) {
  default: return false;
  case AMDGPU::DS_ADD_U32_RTN:
  case AMDGPU::DS_SUB_U32_RTN:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B8:
  case AMDGPU::DS_WRITE_B16:
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_I8:
  case AMDGPU::DS_READ_U8:
  case AMDGPU::DS_READ_I16:
  case AMDGPU::DS_READ_U16:
// Walks the chain of first-successor blocks from From towards To, counting
// instructions; once SkipThreshold instructions have been seen a skip branch
// is considered worthwhile.
// NOTE(review): the function's return statements and closing braces are not
// visible in this chunk of the file.
bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
                                        MachineBasicBlock *To) {

  unsigned NumInstr = 0;

  // Follow each block's first successor until To (or a block with no
  // successors) is reached.
  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
       MBB = *MBB->succ_begin()) {

    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
         NumInstr < SkipThreshold && I != E; ++I) {

      // Count bundle headers and standalone instructions, but not the
      // instructions nested inside a bundle.
      if (I->isBundle() || !I->isBundled())
        if (++NumInstr >= SkipThreshold)
// Inserts an S_CBRANCH_EXECZ before From that jumps to To when the exec mask
// is zero, but only when the skipped region is long enough to justify it.
// NOTE(review): the early 'return' after the shouldSkip test and the branch
// target operand (presumably .addOperand(To)) are not visible in this chunk.
void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
          .addReg(AMDGPU::EXEC);
// For pixel shaders: if the exec mask has become zero (every pixel killed),
// export to the NULL target and terminate the wavefront instead of executing
// the remainder of the function.
// NOTE(review): part of the shader-type condition, the early 'return', and
// several immediate operands of the EXP instruction are not visible in this
// chunk of the file.
void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  // Only worthwhile when the rest of the function is long enough to skip.
  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType !=
      !shouldSkip(&MBB, &MBB.getParent()->back()))

  MachineBasicBlock::iterator Insert = &MI;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addReg(AMDGPU::EXEC);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
// Lowers SI_IF: S_AND_SAVEEXEC_B64 saves the live mask in Reg and ANDs the
// condition into EXEC; the XOR then clears the newly-live bits from the
// saved mask (see the file header example).  Finally, optionally branch over
// the THEN block when no lanes remain active.
// NOTE(review): source-operand lines of both mask instructions (the Vcc
// source of the AND and the Reg source of the XOR) are not visible in this
// chunk of the file.
void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();  // Destination for the saved mask.
  unsigned Vcc = MI.getOperand(1).getReg();  // Branch condition.

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)

  // Clear live bits from the saved exec mask.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)

  // Operand 2 is the branch target past the THEN block.
  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
// Lowers SI_ELSE: at the top of the block S_OR_SAVEEXEC_B64 restores the
// mask saved by SI_IF, then the XOR clears from EXEC the lanes that already
// ran the THEN side, so exactly the remaining lanes execute the ELSE side.
// NOTE(review): the XOR's second source operand (presumably Dst) is not
// visible in this chunk of the file.
void SILowerControlFlowPass::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();  // Mask saved by the matching SI_IF.

  // Inserted after any PHIs so the mask is restored before any real
  // instruction of this block executes.
  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)

  // Operand 2 is the branch target past the ELSE block.
  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
// Lowers SI_BREAK: Dst = EXEC | Src — the currently-active lanes join the
// set of lanes (Src) that have already broken out of the loop.
// NOTE(review): the OR's Src source-operand line is not visible in this
// chunk of the file.
void SILowerControlFlowPass::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)

  MI.eraseFromParent();
// Lowers SI_IF_BREAK: lanes whose condition (Vcc) holds are merged with the
// already-broken lanes (Src) into Dst.
// NOTE(review): the OR's source-operand lines (presumably Vcc and Src) are
// not visible in this chunk of the file.
void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)

  MI.eraseFromParent();
// Lowers SI_ELSE_BREAK: merges the mask saved at the ELSE (Saved) with the
// already-broken lanes (Src) into Dst.
// NOTE(review): the OR's source-operand lines (presumably Saved and Src) are
// not visible in this chunk of the file.
void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)

  MI.eraseFromParent();
// Lowers SI_LOOP: EXEC = EXEC & ~Src removes the lanes that have broken out
// of the loop, then S_CBRANCH_EXECNZ jumps back to the loop header while any
// lane is still active.
// NOTE(review): the ANDN2's Src source-operand line is not visible in this
// chunk of the file.
void SILowerControlFlowPass::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();  // Mask of lanes that have broken out.

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)

  // Operand 1 is the loop-header basic block.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1))
    .addReg(AMDGPU::EXEC);

  MI.eraseFromParent();
// Lowers SI_END_CF: re-enables the lanes saved in Reg at the join point
// (EXEC |= Reg), inserted after any PHIs at the top of the block.
// NOTE(review): the OR's Reg source-operand line is not visible in this
// chunk of the file.
void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();  // Mask saved by the matching SI_IF/SI_ELSE.

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)

  MI.eraseFromParent();
// Lowers S_BRANCH.  The visible code computes the fall-through block (Next)
// and the branch target and then erases the instruction.
// NOTE(review): the logic that uses Next/Target (presumably erasing only
// when Target == Next, i.e. the branch is redundant) is not visible in this
// chunk of the file.
void SILowerControlFlowPass::Branch(MachineInstr &MI) {
  MachineBasicBlock *Next = MI.getParent()->getNextNode();
  MachineBasicBlock *Target = MI.getOperand(0).getMBB();

  MI.eraseFromParent();
// Lowers SI_KILL: uses V_CMPX_LE_F32 to clear from EXEC every lane whose
// operand fails the comparison (i.e. the pixel is discarded).
// NOTE(review): the middle of the assert (presumably 'ShaderType::PIXEL ||')
// and the comparison's first source operand are not visible in this chunk of
// the file.
void SILowerControlFlowPass::Kill(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  // Kill is only allowed in pixel / geometry shaders
  assert(MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
         MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
         ShaderType::GEOMETRY);

  // Clear this pixel from the exec mask if the operand is negative
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
          .addOperand(MI.getOperand(0));

  MI.eraseFromParent();
// Loads the index operand of an indirect access into M0 and inserts the
// MovRel instruction that performs the actual indexed move.  A wave-uniform
// SGPR index is a single S_MOV_B32; a VGPR index requires a readfirstlane
// loop that handles each distinct lane value in turn, masking EXEC down to
// the matching lanes for each iteration.
// NOTE(review): several operand lines (the index sources, the early return
// of the SGPR path, the loop-target operand of the branch, and the source of
// the final EXEC restore) are not visible in this chunk of the file.
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg(); // Scratch SGPR pair for saving EXEC.
  unsigned Idx = MI.getOperand(3).getReg();  // The index register.

  // Easy case: a scalar (wave-uniform) index can be moved to M0 directly.
  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    MBB.insert(I, MovRel);
    MI.eraseFromParent();

  // Vector index: loop over all distinct lane values of Idx.
  assert(AMDGPU::SReg_64RegClass.contains(Save));
  assert(AMDGPU::VReg_32RegClass.contains(Idx));

  // Save the EXEC mask
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC)

  // Move index from VCC into M0
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  // Do the actual move
  MBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addReg(AMDGPU::EXEC);

  // Restore the EXEC mask saved in Save before the loop.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)

  MI.eraseFromParent();
// Lowers SI_INDIRECT_SRC: builds a V_MOVRELS_B32 that reads vector element
// (sub0 + Off), further offset at runtime by M0, and hands it to LoadM0,
// which sets up M0 from the index operand and inserts the move.
// NOTE(review): the handling of a zero SubReg and the final call to
// LoadM0(MI, MovRel) are not visible in this chunk of the file.
void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();  // The register tuple being indexed.
  unsigned Off = MI.getOperand(4).getImm();  // Constant element offset.
  unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);

  // Created detached from any block; LoadM0 inserts it at the right point
  // (possibly inside the readfirstlane loop for a VGPR index).
  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
            .addReg(SubReg + Off)                   // Base source register.
            .addReg(AMDGPU::M0, RegState::Implicit) // Runtime index.
            .addReg(Vec, RegState::Implicit);       // Keep the whole tuple alive.
// Lowers SI_INDIRECT_DST_*: builds a V_MOVRELD_B32 that writes Val into
// vector element (sub0 + Off), further offset at runtime by M0, and hands it
// to LoadM0 as in IndirectSrc.
// NOTE(review): the handling of a zero SubReg, the Val source operand, and
// the final call to LoadM0(MI, MovRel) are not visible in this chunk.
void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();  // The register tuple being written.
  unsigned Off = MI.getOperand(4).getImm();  // Constant element offset.
  unsigned Val = MI.getOperand(5).getReg();  // Value to store.
  unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);

  // Created detached from any block; LoadM0 inserts it at the right point.
  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
            .addReg(SubReg + Off, RegState::Define) // Base destination register.
            .addReg(AMDGPU::M0, RegState::Implicit) // Runtime index.
            .addReg(Dst, RegState::Implicit);       // Whole tuple is affected.
// Pass entry point: walks every instruction of the function, dispatches each
// SI_* pseudo to its lowering helper, records whether any LDS (DS_*)
// instruction appears (M0 must then be initialized at entry) and whether a
// pixel shader needs whole-quad mode (V_INTERP_* seen).
// NOTE(review): this chunk elides large parts of the function — the calls
// inside each switch case, the SI_IF case, the Depth/HaveKill bookkeeping,
// the NeedM0 flag and guard around the M0 initialization, and the closing
// return — so only the visible skeleton is annotated here.
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = MF.getTarget().getInstrInfo();
  TRI = MF.getTarget().getRegisterInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;  // A SI_KILL has been seen inside open control flow.

  bool NeedWQM = false;   // Pixel interpolation requires whole-quad mode.

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();

    MachineBasicBlock &MBB = *BI;
    // Iterate with an explicitly maintained Next iterator because the
    // lowering helpers erase the current instruction.
    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
         I != MBB.end(); I = Next) {

      Next = llvm::next(I);
      MachineInstr &MI = *I;
      // LDS access seen: M0 will need to be initialized at function entry.
      if (isDS(MI.getOpcode())) {

      switch (MI.getOpcode()) {
        case AMDGPU::SI_ELSE:

        case AMDGPU::SI_BREAK:

        case AMDGPU::SI_IF_BREAK:

        case AMDGPU::SI_ELSE_BREAK:

        case AMDGPU::SI_LOOP:

        case AMDGPU::SI_END_CF:
          // Leaving the outermost control-flow region with a pending kill:
          // insert the exec-is-zero early-out.
          if (--Depth == 0 && HaveKill) {

        case AMDGPU::SI_KILL:

        case AMDGPU::S_BRANCH:

        case AMDGPU::SI_INDIRECT_SRC:

        // All SI_INDIRECT_DST variants share the IndirectDst lowering.
        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:

        // Interpolation instructions force whole-quad mode in pixel shaders.
        case AMDGPU::V_INTERP_P1_F32:
        case AMDGPU::V_INTERP_P2_F32:
        case AMDGPU::V_INTERP_MOV_F32:

    MachineBasicBlock &MBB = MF.front();
    // Initialize M0 to a value that won't cause LDS access to be discarded
    // due to offset clamping
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
            AMDGPU::M0).addImm(0xffffffff);

  // Enable all four lanes of each pixel quad for derivative computations.
  if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);