1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief This pass lowers the pseudo control flow instructions to real
12 /// machine instructions.
14 /// All control flow is handled using predicated instructions and
15 /// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
16 /// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
17 /// by writting to the 64-bit EXEC register (each bit corresponds to a
18 /// single vector ALU). Typically, for predicates, a vector ALU will write
19 /// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
20 /// Vector ALU) and then the ScalarALU will AND the VCC register with the
21 /// EXEC to update the predicates.
24 /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
25 /// %SGPR0 = SI_IF %VCC
26 /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
27 /// %SGPR0 = SI_ELSE %SGPR0
28 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
33 /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
34 /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
35 /// S_CBRANCH_EXECZ label0 // This instruction is an optional
36 /// // optimization which allows us to
37 /// // branch if all the bits of
39 /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
42 /// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block
43 /// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
44 /// S_BRANCH_EXECZ label1 // Use our branch optimization
45 /// // instruction again.
46 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block
48 /// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
49 //===----------------------------------------------------------------------===//
52 #include "AMDGPUSubtarget.h"
53 #include "SIInstrInfo.h"
54 #include "SIMachineFunctionInfo.h"
55 #include "llvm/CodeGen/LivePhysRegs.h"
56 #include "llvm/CodeGen/MachineFrameInfo.h"
57 #include "llvm/CodeGen/MachineFunction.h"
58 #include "llvm/CodeGen/MachineFunctionPass.h"
59 #include "llvm/CodeGen/MachineInstrBuilder.h"
60 #include "llvm/CodeGen/MachineRegisterInfo.h"
64 #define DEBUG_TYPE "si-lower-control-flow"
68 class SILowerControlFlow : public MachineFunctionPass {
70 const SIRegisterInfo *TRI;
71 const SIInstrInfo *TII;
73 MachineRegisterInfo *MRI;
75 void emitIf(MachineInstr &MI);
76 void emitElse(MachineInstr &MI);
77 void emitBreak(MachineInstr &MI);
78 void emitIfBreak(MachineInstr &MI);
79 void emitElseBreak(MachineInstr &MI);
80 void emitLoop(MachineInstr &MI);
81 void emitEndCf(MachineInstr &MI);
83 void findMaskOperands(MachineInstr &MI, unsigned OpNo,
84 SmallVectorImpl<MachineOperand> &Src) const;
86 void combineMasks(MachineInstr &MI);
91 SILowerControlFlow() :
92 MachineFunctionPass(ID),
98 bool runOnMachineFunction(MachineFunction &MF) override;
100 StringRef getPassName() const override {
101 return "SI Lower control flow pseudo instructions";
104 void getAnalysisUsage(AnalysisUsage &AU) const override {
105 // Should preserve the same set that TwoAddressInstructions does.
106 AU.addPreserved<SlotIndexes>();
107 AU.addPreserved<LiveIntervals>();
108 AU.addPreservedID(LiveVariablesID);
109 AU.addPreservedID(MachineLoopInfoID);
110 AU.addPreservedID(MachineDominatorsID);
111 AU.setPreservesCFG();
112 MachineFunctionPass::getAnalysisUsage(AU);
116 } // End anonymous namespace
118 char SILowerControlFlow::ID = 0;
120 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
121 "SI lower control flow", false, false)
123 static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
124 MachineOperand &ImpDefSCC = MI.getOperand(3);
125 assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
127 ImpDefSCC.setIsDead(IsDead);
130 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
132 void SILowerControlFlow::emitIf(MachineInstr &MI) {
133 MachineBasicBlock &MBB = *MI.getParent();
134 const DebugLoc &DL = MI.getDebugLoc();
135 MachineBasicBlock::iterator I(&MI);
137 MachineOperand &SaveExec = MI.getOperand(0);
138 MachineOperand &Cond = MI.getOperand(1);
139 assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
140 Cond.getSubReg() == AMDGPU::NoSubRegister);
142 unsigned SaveExecReg = SaveExec.getReg();
144 MachineOperand &ImpDefSCC = MI.getOperand(4);
145 assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
147 // Add an implicit def of exec to discourage scheduling VALU after this which
148 // will interfere with trying to form s_and_saveexec_b64 later.
149 unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
150 MachineInstr *CopyExec =
151 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
152 .addReg(AMDGPU::EXEC)
153 .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
155 unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
158 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
160 //.addReg(AMDGPU::EXEC)
161 .addReg(Cond.getReg());
162 setImpSCCDefDead(*And, true);
165 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
168 setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
170 // Use a copy that is a terminator to get correct spill code placement it with
172 MachineInstr *SetExec =
173 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
174 .addReg(Tmp, RegState::Kill);
176 // Insert a pseudo terminator to help keep the verifier happy. This will also
177 // be used later when inserting skips.
178 MachineInstr *NewBr =
179 BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
180 .addOperand(MI.getOperand(2));
183 MI.eraseFromParent();
187 LIS->InsertMachineInstrInMaps(*CopyExec);
189 // Replace with and so we don't need to fix the live interval for condition
191 LIS->ReplaceMachineInstrInMaps(MI, *And);
193 LIS->InsertMachineInstrInMaps(*Xor);
194 LIS->InsertMachineInstrInMaps(*SetExec);
195 LIS->InsertMachineInstrInMaps(*NewBr);
197 LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
198 MI.eraseFromParent();
200 // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
201 // hard to add another def here but I'm not sure how to correctly update the
203 LIS->removeInterval(SaveExecReg);
204 LIS->createAndComputeVirtRegInterval(SaveExecReg);
205 LIS->createAndComputeVirtRegInterval(Tmp);
206 LIS->createAndComputeVirtRegInterval(CopyReg);
209 void SILowerControlFlow::emitElse(MachineInstr &MI) {
210 MachineBasicBlock &MBB = *MI.getParent();
211 const DebugLoc &DL = MI.getDebugLoc();
213 unsigned DstReg = MI.getOperand(0).getReg();
214 assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
216 bool ExecModified = MI.getOperand(3).getImm() != 0;
217 MachineBasicBlock::iterator Start = MBB.begin();
219 // We are running before TwoAddressInstructions, and si_else's operands are
220 // tied. In order to correctly tie the registers, split this into a copy of
221 // the src like it does.
222 unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
223 BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
224 .addOperand(MI.getOperand(1)); // Saved EXEC
226 // This must be inserted before phis and any spill code inserted before the
228 unsigned SaveReg = ExecModified ?
229 MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
230 MachineInstr *OrSaveExec =
231 BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
234 MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
236 MachineBasicBlock::iterator ElsePt(MI);
240 BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
241 .addReg(AMDGPU::EXEC)
245 LIS->InsertMachineInstrInMaps(*And);
249 BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
250 .addReg(AMDGPU::EXEC)
253 MachineInstr *Branch =
254 BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
258 MI.eraseFromParent();
262 LIS->RemoveMachineInstrFromMaps(MI);
263 MI.eraseFromParent();
265 LIS->InsertMachineInstrInMaps(*OrSaveExec);
267 LIS->InsertMachineInstrInMaps(*Xor);
268 LIS->InsertMachineInstrInMaps(*Branch);
270 // src reg is tied to dst reg.
271 LIS->removeInterval(DstReg);
272 LIS->createAndComputeVirtRegInterval(DstReg);
273 LIS->createAndComputeVirtRegInterval(CopyReg);
275 LIS->createAndComputeVirtRegInterval(SaveReg);
277 // Let this be recomputed.
278 LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
281 void SILowerControlFlow::emitBreak(MachineInstr &MI) {
282 MachineBasicBlock &MBB = *MI.getParent();
283 const DebugLoc &DL = MI.getDebugLoc();
284 unsigned Dst = MI.getOperand(0).getReg();
287 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
288 .addReg(AMDGPU::EXEC)
289 .addOperand(MI.getOperand(1));
292 LIS->ReplaceMachineInstrInMaps(MI, *Or);
293 MI.eraseFromParent();
296 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
297 MI.setDesc(TII->get(AMDGPU::S_OR_B64));
300 void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
301 MI.setDesc(TII->get(AMDGPU::S_OR_B64));
304 void SILowerControlFlow::emitLoop(MachineInstr &MI) {
305 MachineBasicBlock &MBB = *MI.getParent();
306 const DebugLoc &DL = MI.getDebugLoc();
308 MachineInstr *AndN2 =
309 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
310 .addReg(AMDGPU::EXEC)
311 .addOperand(MI.getOperand(0));
313 MachineInstr *Branch =
314 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
315 .addOperand(MI.getOperand(1));
318 LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
319 LIS->InsertMachineInstrInMaps(*Branch);
322 MI.eraseFromParent();
325 void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
326 MachineBasicBlock &MBB = *MI.getParent();
327 const DebugLoc &DL = MI.getDebugLoc();
329 MachineBasicBlock::iterator InsPt = MBB.begin();
330 MachineInstr *NewMI =
331 BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
332 .addReg(AMDGPU::EXEC)
333 .addOperand(MI.getOperand(0));
336 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
338 MI.eraseFromParent();
341 LIS->handleMove(*NewMI);
344 // Returns replace operands for a logical operation, either single result
345 // for exec or two operands if source was another equivalent operation.
346 void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
347 SmallVectorImpl<MachineOperand> &Src) const {
348 MachineOperand &Op = MI.getOperand(OpNo);
349 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
354 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
355 if (!Def || Def->getParent() != MI.getParent() ||
356 !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
359 // Make sure we do not modify exec between def and use.
360 // A copy with implcitly defined exec inserted earlier is an exclusion, it
361 // does not really modify exec.
362 for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
363 if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
364 !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
367 for (const auto &SrcOp : Def->explicit_operands())
368 if (SrcOp.isUse() && (!SrcOp.isReg() ||
369 TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
370 SrcOp.getReg() == AMDGPU::EXEC))
371 Src.push_back(SrcOp);
374 // Search and combine pairs of equivalent instructions, like
375 // S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
376 // S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y
377 // One of the operands is exec mask.
378 void SILowerControlFlow::combineMasks(MachineInstr &MI) {
379 assert(MI.getNumExplicitOperands() == 3);
380 SmallVector<MachineOperand, 4> Ops;
381 unsigned OpToReplace = 1;
382 findMaskOperands(MI, 1, Ops);
383 if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
384 findMaskOperands(MI, 2, Ops);
385 if (Ops.size() != 3) return;
387 unsigned UniqueOpndIdx;
388 if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
389 else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
390 else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
393 unsigned Reg = MI.getOperand(OpToReplace).getReg();
394 MI.RemoveOperand(OpToReplace);
395 MI.addOperand(Ops[UniqueOpndIdx]);
396 if (MRI->use_empty(Reg))
397 MRI->getUniqueVRegDef(Reg)->eraseFromParent();
400 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
401 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
402 TII = ST.getInstrInfo();
403 TRI = &TII->getRegisterInfo();
405 // This doesn't actually need LiveIntervals, but we can preserve them.
406 LIS = getAnalysisIfAvailable<LiveIntervals>();
407 MRI = &MF.getRegInfo();
409 MachineFunction::iterator NextBB;
410 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
411 BI != BE; BI = NextBB) {
412 NextBB = std::next(BI);
413 MachineBasicBlock &MBB = *BI;
415 MachineBasicBlock::iterator I, Next, Last;
417 for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
419 MachineInstr &MI = *I;
421 switch (MI.getOpcode()) {
426 case AMDGPU::SI_ELSE:
430 case AMDGPU::SI_BREAK:
434 case AMDGPU::SI_IF_BREAK:
438 case AMDGPU::SI_ELSE_BREAK:
442 case AMDGPU::SI_LOOP:
446 case AMDGPU::SI_END_CF:
450 case AMDGPU::S_AND_B64:
451 case AMDGPU::S_OR_B64:
452 // Cleanup bit manipulations on exec mask
462 // Replay newly inserted code to combine masks
463 Next = (Last == MBB.end()) ? MBB.begin() : Last;