contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

   1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief This pass lowers the pseudo control flow instructions to real
  12 /// machine instructions.
  13 ///
  14 /// All control flow is handled using predicated instructions and
  15 /// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
  16 /// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
  17 /// by writing to the 64-bit EXEC register (each bit corresponds to a
  18 /// single vector ALU).  Typically, for predicates, a vector ALU will write
  19 /// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
  20 /// Vector ALU) and then the ScalarALU will AND the VCC register with the
  21 /// EXEC to update the predicates.
  22 ///
  23 /// For example:
  24 /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
  25 /// %SGPR0 = SI_IF %VCC
  26 ///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
  27 /// %SGPR0 = SI_ELSE %SGPR0
  28 ///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
  29 /// SI_END_CF %SGPR0
  30 ///
  31 /// becomes:
  32 ///
  33 /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
  34 /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
  35 /// S_CBRANCH_EXECZ label0            // This instruction is an optional
  36 ///                                   // optimization which allows us to
  37 ///                                   // branch if all the bits of
  38 ///                                   // EXEC are zero.
  39 /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
  40 ///
  41 /// label0:
  42 /// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  // Restore the exec mask for the Else block
  43 /// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
  44 /// S_CBRANCH_EXECZ label1             // Use our branch optimization
  45 ///                                    // instruction again.
  46 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
  47 /// label1:
  48 /// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
  49 //===----------------------------------------------------------------------===//
  50
  51 #include "AMDGPU.h"
  52 #include "AMDGPUSubtarget.h"
  53 #include "SIInstrInfo.h"
  54 #include "SIMachineFunctionInfo.h"
  55 #include "llvm/CodeGen/LivePhysRegs.h"
  56 #include "llvm/CodeGen/MachineFrameInfo.h"
  57 #include "llvm/CodeGen/MachineFunction.h"
  58 #include "llvm/CodeGen/MachineFunctionPass.h"
  59 #include "llvm/CodeGen/MachineInstrBuilder.h"
  60 #include "llvm/CodeGen/MachineRegisterInfo.h"
  61 #include "llvm/IR/Constants.h"
  62
  63 using namespace llvm;
  64
  65 #define DEBUG_TYPE "si-lower-control-flow"
  66
  67 namespace {
  68
  69 class SILowerControlFlow : public MachineFunctionPass {
  70 private:
  71   static const unsigned SkipThreshold = 12;
  72
  73   const SIRegisterInfo *TRI;
  74   const SIInstrInfo *TII;
  75
  76   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
  77
  78   void Skip(MachineInstr &From, MachineOperand &To);
  79   bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
  80
  81   void If(MachineInstr &MI);
  82   void Else(MachineInstr &MI, bool ExecModified);
  83   void Break(MachineInstr &MI);
  84   void IfBreak(MachineInstr &MI);
  85   void ElseBreak(MachineInstr &MI);
  86   void Loop(MachineInstr &MI);
  87   void EndCf(MachineInstr &MI);
  88
  89   void Kill(MachineInstr &MI);
  90   void Branch(MachineInstr &MI);
  91
  92   MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
  93                                      MachineBasicBlock::iterator I) const;
  94
  95   std::pair<MachineBasicBlock *, MachineBasicBlock *>
  96   splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
  97
  98   void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
  99                                const MachineRegisterInfo &MRI,
 100                                const MachineInstr &MI,
 101                                MachineBasicBlock &LoopBB,
 102                                MachineBasicBlock &RemainderBB,
 103                                unsigned SaveReg,
 104                                const MachineOperand &IdxReg);
 105
 106   void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
 107                               MachineInstr *MovRel,
 108                               const MachineOperand &IdxReg,
 109                               int Offset);
 110
 111   bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
 112   std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
 113                                                        int Offset) const;
 114   bool indirectSrc(MachineInstr &MI);
 115   bool indirectDst(MachineInstr &MI);
 116
 117 public:
 118   static char ID;
 119
 120   SILowerControlFlow() :
 121     MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
 122
 123   bool runOnMachineFunction(MachineFunction &MF) override;
 124
 125   const char *getPassName() const override {
 126     return "SI Lower control flow pseudo instructions";
 127   }
 128 };
 129
 130 } // End anonymous namespace
 131
 132 char SILowerControlFlow::ID = 0; // Definition of the pass's static ID member.
 133
     // Registers the pass under the DEBUG_TYPE name ("si-lower-control-flow").
 134 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
 135                 "SI lower control flow", false, false)
 136
     // Exposes the pass ID in the llvm namespace as a reference to the member
     // defined above.
 137 char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
 138
 139
 140 FunctionPass *llvm::createSILowerControlFlowPass() {
 141   return new SILowerControlFlow();
 142 }
 143
 144 static bool opcodeEmitsNoInsts(unsigned Opc) {
 145   switch (Opc) {
 146   case TargetOpcode::IMPLICIT_DEF:
 147   case TargetOpcode::KILL:
 148   case TargetOpcode::BUNDLE:
 149   case TargetOpcode::CFI_INSTRUCTION:
 150   case TargetOpcode::EH_LABEL:
 151   case TargetOpcode::GC_LABEL:
 152   case TargetOpcode::DBG_VALUE:
 153     return true;
 154   default:
 155     return false;
 156   }
 157 }
 158
 159 bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
 160                                     MachineBasicBlock *To) {
 161   if (From->succ_empty())
 162     return false;
 163
 164   unsigned NumInstr = 0;
 165   MachineFunction *MF = From->getParent();
 166
 167   for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
 168        MBBI != End && MBBI != ToI; ++MBBI) {
 169     MachineBasicBlock &MBB = *MBBI;
 170
 171     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 172          NumInstr < SkipThreshold && I != E; ++I) {
 173       if (opcodeEmitsNoInsts(I->getOpcode()))
 174         continue;
 175
 176       // When a uniform loop is inside non-uniform control flow, the branch
 177       // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
 178       // when EXEC = 0. We should skip the loop lest it becomes infinite.
 179       if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
 180           I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
 181         return true;
 182
 183       if (I->isInlineAsm()) {
 184         const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
 185         const char *AsmStr = I->getOperand(0).getSymbolName();
 186
 187         // inlineasm length estimate is number of bytes assuming the longest
 188         // instruction.
 189         uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
 190         NumInstr += MaxAsmSize / MAI->getMaxInstLength();
 191       } else {
 192         ++NumInstr;
 193       }
 194
 195       if (NumInstr >= SkipThreshold)
 196         return true;
 197     }
 198   }
 199
 200   return false;
 201 }
 202
 203 void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
 204
 205   if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
 206     return;
 207
 208   DebugLoc DL = From.getDebugLoc();
 209   BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
 210     .addOperand(To);
 211 }
 212
 213 bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
 214   MachineBasicBlock &MBB = *MI.getParent();
 215   MachineFunction *MF = MBB.getParent();
 216
 217   if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
 218       !shouldSkip(&MBB, &MBB.getParent()->back()))
 219     return false;
 220
 221   MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
 222   MBB.addSuccessor(SkipBB);
 223
 224   const DebugLoc &DL = MI.getDebugLoc();
 225
 226   // If the exec mask is non-zero, skip the next two instructions
 227   BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
 228     .addMBB(&NextBB);
 229
 230   MachineBasicBlock::iterator Insert = SkipBB->begin();
 231
 232   // Exec mask is zero: Export to NULL target...
 233   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
 234     .addImm(0)
 235     .addImm(0x09) // V_008DFC_SQ_EXP_NULL
 236     .addImm(0)
 237     .addImm(1)
 238     .addImm(1)
 239     .addReg(AMDGPU::VGPR0, RegState::Undef)
 240     .addReg(AMDGPU::VGPR0, RegState::Undef)
 241     .addReg(AMDGPU::VGPR0, RegState::Undef)
 242     .addReg(AMDGPU::VGPR0, RegState::Undef);
 243
 244   // ... and terminate wavefront.
 245   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
 246
 247   return true;
 248 }
 249
 250 void SILowerControlFlow::If(MachineInstr &MI) {
 251   MachineBasicBlock &MBB = *MI.getParent();
 252   DebugLoc DL = MI.getDebugLoc();
 253   unsigned Reg = MI.getOperand(0).getReg();
 254   unsigned Vcc = MI.getOperand(1).getReg();
 255
 256   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
 257           .addReg(Vcc);
 258
 259   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
 260           .addReg(AMDGPU::EXEC)
 261           .addReg(Reg);
 262
 263   Skip(MI, MI.getOperand(2));
 264
 265   // Insert a pseudo terminator to help keep the verifier happy.
 266   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
 267     .addOperand(MI.getOperand(2))
 268     .addReg(Reg);
 269
 270   MI.eraseFromParent();
 271 }
 272
 273 void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
 274   MachineBasicBlock &MBB = *MI.getParent();
 275   DebugLoc DL = MI.getDebugLoc();
 276   unsigned Dst = MI.getOperand(0).getReg();
 277   unsigned Src = MI.getOperand(1).getReg();
 278
 279   BuildMI(MBB, MBB.getFirstNonPHI(), DL,
 280           TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
 281           .addReg(Src); // Saved EXEC
 282
 283   if (ExecModified) {
 284     // Adjust the saved exec to account for the modifications during the flow
 285     // block that contains the ELSE. This can happen when WQM mode is switched
 286     // off.
 287     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
 288             .addReg(AMDGPU::EXEC)
 289             .addReg(Dst);
 290   }
 291
 292   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
 293           .addReg(AMDGPU::EXEC)
 294           .addReg(Dst);
 295
 296   Skip(MI, MI.getOperand(2));
 297
 298   // Insert a pseudo terminator to help keep the verifier happy.
 299   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
 300     .addOperand(MI.getOperand(2))
 301     .addReg(Dst);
 302
 303   MI.eraseFromParent();
 304 }
 305
 306 void SILowerControlFlow::Break(MachineInstr &MI) {
 307   MachineBasicBlock &MBB = *MI.getParent();
 308   DebugLoc DL = MI.getDebugLoc();
 309
 310   unsigned Dst = MI.getOperand(0).getReg();
 311   unsigned Src = MI.getOperand(1).getReg();
 312
 313   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
 314           .addReg(AMDGPU::EXEC)
 315           .addReg(Src);
 316
 317   MI.eraseFromParent();
 318 }
 319
 320 void SILowerControlFlow::IfBreak(MachineInstr &MI) {
 321   MachineBasicBlock &MBB = *MI.getParent();
 322   DebugLoc DL = MI.getDebugLoc();
 323
 324   unsigned Dst = MI.getOperand(0).getReg();
 325   unsigned Vcc = MI.getOperand(1).getReg();
 326   unsigned Src = MI.getOperand(2).getReg();
 327
 328   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
 329           .addReg(Vcc)
 330           .addReg(Src);
 331
 332   MI.eraseFromParent();
 333 }
 334
 335 void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
 336   MachineBasicBlock &MBB = *MI.getParent();
 337   DebugLoc DL = MI.getDebugLoc();
 338
 339   unsigned Dst = MI.getOperand(0).getReg();
 340   unsigned Saved = MI.getOperand(1).getReg();
 341   unsigned Src = MI.getOperand(2).getReg();
 342
 343   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
 344           .addReg(Saved)
 345           .addReg(Src);
 346
 347   MI.eraseFromParent();
 348 }
 349
 350 void SILowerControlFlow::Loop(MachineInstr &MI) {
 351   MachineBasicBlock &MBB = *MI.getParent();
 352   DebugLoc DL = MI.getDebugLoc();
 353   unsigned Src = MI.getOperand(0).getReg();
 354
 355   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
 356           .addReg(AMDGPU::EXEC)
 357           .addReg(Src);
 358
 359   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
 360     .addOperand(MI.getOperand(1));
 361
 362   MI.eraseFromParent();
 363 }
 364
 365 void SILowerControlFlow::EndCf(MachineInstr &MI) {
 366   MachineBasicBlock &MBB = *MI.getParent();
 367   DebugLoc DL = MI.getDebugLoc();
 368   unsigned Reg = MI.getOperand(0).getReg();
 369
 370   BuildMI(MBB, MBB.getFirstNonPHI(), DL,
 371           TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
 372           .addReg(AMDGPU::EXEC)
 373           .addReg(Reg);
 374
 375   MI.eraseFromParent();
 376 }
 377
 378 void SILowerControlFlow::Branch(MachineInstr &MI) {
 379   MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
 380   if (MBB == MI.getParent()->getNextNode())
 381     MI.eraseFromParent();
 382
 383   // If these aren't equal, this is probably an infinite loop.
 384 }
 385
 386 void SILowerControlFlow::Kill(MachineInstr &MI) {
 387   MachineBasicBlock &MBB = *MI.getParent();
 388   DebugLoc DL = MI.getDebugLoc();
 389   const MachineOperand &Op = MI.getOperand(0);
 390
 391 #ifndef NDEBUG
 392   CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
 393   // Kill is only allowed in pixel / geometry shaders.
 394   assert(CallConv == CallingConv::AMDGPU_PS ||
 395          CallConv == CallingConv::AMDGPU_GS);
 396 #endif
 397
 398   // Clear this thread from the exec mask if the operand is negative
 399   if ((Op.isImm())) {
 400     // Constant operand: Set exec mask to 0 or do nothing
 401     if (Op.getImm() & 0x80000000) {
 402       BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
 403               .addImm(0);
 404     }
 405   } else {
 406     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
 407            .addImm(0)
 408            .addOperand(Op);
 409   }
 410
 411   MI.eraseFromParent();
 412 }
 413
 414 // All currently live registers must remain so in the remainder block.
 415 void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
 416                                                  const MachineRegisterInfo &MRI,
 417                                                  const MachineInstr &MI,
 418                                                  MachineBasicBlock &LoopBB,
 419                                                  MachineBasicBlock &RemainderBB,
 420                                                  unsigned SaveReg,
 421                                                  const MachineOperand &IdxReg) {
 422   // Add reg defined in loop body.
 423   RemainderLiveRegs.addReg(SaveReg);
 424
 425   if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
 426     if (!Val->isUndef()) {
 427       RemainderLiveRegs.addReg(Val->getReg());
 428       LoopBB.addLiveIn(Val->getReg());
 429     }
 430   }
 431
 432   for (unsigned Reg : RemainderLiveRegs) {
 433     if (MRI.isAllocatable(Reg))
 434       RemainderBB.addLiveIn(Reg);
 435   }
 436
 437   const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
 438   if (!Src->isUndef())
 439     LoopBB.addLiveIn(Src->getReg());
 440
 441   if (!IdxReg.isUndef())
 442     LoopBB.addLiveIn(IdxReg.getReg());
 443   LoopBB.sortUniqueLiveIns();
 444 }
 445
 446 void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
 447                                                 DebugLoc DL,
 448                                                 MachineInstr *MovRel,
 449                                                 const MachineOperand &IdxReg,
 450                                                 int Offset) {
 451   MachineBasicBlock::iterator I = LoopBB.begin();
 452
 453   // Read the next variant into VCC (lower 32 bits) <- also loop target
 454   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
 455     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
 456
 457   // Move index from VCC into M0
 458   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
 459     .addReg(AMDGPU::VCC_LO);
 460
 461   // Compare the just read M0 value to all possible Idx values
 462   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
 463     .addReg(AMDGPU::M0)
 464     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
 465
 466   // Update EXEC, save the original EXEC value to VCC
 467   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
 468     .addReg(AMDGPU::VCC);
 469
 470   if (Offset != 0) {
 471     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
 472       .addReg(AMDGPU::M0)
 473       .addImm(Offset);
 474   }
 475
 476   // Do the actual move
 477   LoopBB.insert(I, MovRel);
 478
 479   // Update EXEC, switch all done bits to 0 and all todo bits to 1
 480   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
 481     .addReg(AMDGPU::EXEC)
 482     .addReg(AMDGPU::VCC);
 483
 484   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
 485   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
 486     .addMBB(&LoopBB);
 487 }
 488
 489 MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
 490   MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
 491   MachineFunction *MF = MBB.getParent();
 492
 493   MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
 494   MachineFunction::iterator MBBI(MBB);
 495   ++MBBI;
 496
 497   MF->insert(MBBI, SkipBB);
 498
 499   return SkipBB;
 500 }
 501
 502 std::pair<MachineBasicBlock *, MachineBasicBlock *>
 503 SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
 504                                MachineBasicBlock::iterator I) {
 505   MachineFunction *MF = MBB.getParent();
 506
 507   // To insert the loop we need to split the block. Move everything after this
 508   // point to a new block, and insert a new empty block between the two.
 509   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
 510   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
 511   MachineFunction::iterator MBBI(MBB);
 512   ++MBBI;
 513
 514   MF->insert(MBBI, LoopBB);
 515   MF->insert(MBBI, RemainderBB);
 516
 517   // Move the rest of the block into a new block.
 518   RemainderBB->transferSuccessors(&MBB);
 519   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
 520
 521   MBB.addSuccessor(LoopBB);
 522
 523   return std::make_pair(LoopBB, RemainderBB);
 524 }
 525
 526 // Returns true if a new block was inserted.
 527 bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
 528   MachineBasicBlock &MBB = *MI.getParent();
 529   DebugLoc DL = MI.getDebugLoc();
 530   MachineBasicBlock::iterator I(&MI);
 531
 532   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 533
 534   if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
 535     if (Offset != 0) {
 536       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
 537         .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
 538         .addImm(Offset);
 539     } else {
 540       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
 541         .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
 542     }
 543
 544     MBB.insert(I, MovRel);
 545     MI.eraseFromParent();
 546     return false;
 547   }
 548
 549   MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
 550   SaveOp->setIsDead(false);
 551   unsigned Save = SaveOp->getReg();
 552
 553   // Reading from a VGPR requires looping over all workitems in the wavefront.
 554   assert(AMDGPU::SReg_64RegClass.contains(Save) &&
 555          AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
 556
 557   // Save the EXEC mask
 558   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
 559     .addReg(AMDGPU::EXEC);
 560
 561   LivePhysRegs RemainderLiveRegs(TRI);
 562
 563   RemainderLiveRegs.addLiveOuts(MBB);
 564
 565   MachineBasicBlock *LoopBB;
 566   MachineBasicBlock *RemainderBB;
 567
 568   std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
 569
 570   for (const MachineInstr &Inst : reverse(*RemainderBB))
 571     RemainderLiveRegs.stepBackward(Inst);
 572
 573   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 574   LoopBB->addSuccessor(RemainderBB);
 575   LoopBB->addSuccessor(LoopBB);
 576
 577   splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
 578                           *RemainderBB, Save, *Idx);
 579
 580   emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
 581
 582   MachineBasicBlock::iterator First = RemainderBB->begin();
 583   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
 584     .addReg(Save);
 585
 586   MI.eraseFromParent();
 587   return true;
 588 }
 589
 590 /// \param @VecReg The register which holds element zero of the vector being
 591 ///                 addressed into.
 592 //
 593 /// \param[in] @Idx The index operand from the movrel instruction. This must be
 594 // a register, but may be NoRegister.
 595 ///
 596 /// \param[in] @Offset As an input, this is the constant offset part of the
 597 // indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant
 598 // value that needs to be added to the value stored in M0.
 599 std::pair<unsigned, int>
 600 SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
 601   unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
 602   if (!SubReg)
 603     SubReg = VecReg;
 604
 605   const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
 606   const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
 607   int NumElts = SuperRC->getSize() / RC->getSize();
 608
 609   int BaseRegIdx = TRI->getHWRegIndex(SubReg);
 610
 611   // Skip out of bounds offsets, or else we would end up using an undefined
 612   // register.
 613   if (Offset >= NumElts)
 614     return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
 615
 616   int RegIdx = BaseRegIdx + Offset;
 617   if (RegIdx < 0) {
 618     Offset = RegIdx;
 619     RegIdx = 0;
 620   } else {
 621     Offset = 0;
 622   }
 623
 624   unsigned Reg = RC->getRegister(RegIdx);
 625   return std::make_pair(Reg, Offset);
 626 }
 627
 628 // Return true if a new block was inserted.
 629 bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
 630   MachineBasicBlock &MBB = *MI.getParent();
 631   const DebugLoc &DL = MI.getDebugLoc();
 632
 633   unsigned Dst = MI.getOperand(0).getReg();
 634   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
 635   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
 636   unsigned Reg;
 637
 638   std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
 639
 640   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 641   if (Idx->getReg() == AMDGPU::NoRegister) {
 642     // Only had a constant offset, copy the register directly.
 643     BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
 644       .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
 645     MI.eraseFromParent();
 646     return false;
 647   }
 648
 649   MachineInstr *MovRel =
 650     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
 651     .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
 652     .addReg(SrcVec->getReg(), RegState::Implicit);
 653
 654   return loadM0(MI, MovRel, Offset);
 655 }
 656
 657 // Return true if a new block was inserted.
 658 bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
 659   MachineBasicBlock &MBB = *MI.getParent();
 660   const DebugLoc &DL = MI.getDebugLoc();
 661
 662   unsigned Dst = MI.getOperand(0).getReg();
 663   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
 664   unsigned Reg;
 665
 666   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
 667   std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
 668
 669   MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 670   if (Idx->getReg() == AMDGPU::NoRegister) {
 671     // Only had a constant offset, copy the register directly.
 672     BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
 673       .addOperand(*Val);
 674     MI.eraseFromParent();
 675     return false;
 676   }
 677
 678   MachineInstr *MovRel =
 679     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
 680     .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
 681     .addReg(Dst, RegState::Implicit);
 682
 683   return loadM0(MI, MovRel, Offset);
 684 }
 685
 686 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
 687   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
 688   TII = ST.getInstrInfo();
 689   TRI = &TII->getRegisterInfo();
 690
 691   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 692
 693   bool HaveKill = false;
 694   bool NeedFlat = false;
 695   unsigned Depth = 0;
 696
 697   MachineFunction::iterator NextBB;
 698
 699   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 700        BI != BE; BI = NextBB) {
 701     NextBB = std::next(BI);
 702     MachineBasicBlock &MBB = *BI;
 703
 704     MachineBasicBlock *EmptyMBBAtEnd = nullptr;
 705     MachineBasicBlock::iterator I, Next;
 706     bool ExecModified = false;
 707
 708     for (I = MBB.begin(); I != MBB.end(); I = Next) {
 709       Next = std::next(I);
 710
 711       MachineInstr &MI = *I;
 712
 713       // Flat uses m0 in case it needs to access LDS.
 714       if (TII->isFLAT(MI))
 715         NeedFlat = true;
 716
 717       if (I->modifiesRegister(AMDGPU::EXEC, TRI))
 718         ExecModified = true;
 719
 720       switch (MI.getOpcode()) {
 721         default: break;
 722         case AMDGPU::SI_IF:
 723           ++Depth;
 724           If(MI);
 725           break;
 726
 727         case AMDGPU::SI_ELSE:
 728           Else(MI, ExecModified);
 729           break;
 730
 731         case AMDGPU::SI_BREAK:
 732           Break(MI);
 733           break;
 734
 735         case AMDGPU::SI_IF_BREAK:
 736           IfBreak(MI);
 737           break;
 738
 739         case AMDGPU::SI_ELSE_BREAK:
 740           ElseBreak(MI);
 741           break;
 742
 743         case AMDGPU::SI_LOOP:
 744           ++Depth;
 745           Loop(MI);
 746           break;
 747
 748         case AMDGPU::SI_END_CF:
 749           if (--Depth == 0 && HaveKill) {
 750             HaveKill = false;
 751             // TODO: Insert skip if exec is 0?
 752           }
 753
 754           EndCf(MI);
 755           break;
 756
 757         case AMDGPU::SI_KILL_TERMINATOR:
 758           if (Depth == 0) {
 759             if (skipIfDead(MI, *NextBB)) {
 760               NextBB = std::next(BI);
 761               BE = MF.end();
 762             }
 763           } else
 764             HaveKill = true;
 765           Kill(MI);
 766           break;
 767
 768         case AMDGPU::S_BRANCH:
 769           Branch(MI);
 770           break;
 771
 772         case AMDGPU::SI_INDIRECT_SRC_V1:
 773         case AMDGPU::SI_INDIRECT_SRC_V2:
 774         case AMDGPU::SI_INDIRECT_SRC_V4:
 775         case AMDGPU::SI_INDIRECT_SRC_V8:
 776         case AMDGPU::SI_INDIRECT_SRC_V16:
 777           if (indirectSrc(MI)) {
 778             // The block was split at this point. We can safely skip the middle
 779             // inserted block to the following which contains the rest of this
 780             // block's instructions.
 781             NextBB = std::next(BI);
 782             BE = MF.end();
 783             Next = MBB.end();
 784           }
 785
 786           break;
 787
 788         case AMDGPU::SI_INDIRECT_DST_V1:
 789         case AMDGPU::SI_INDIRECT_DST_V2:
 790         case AMDGPU::SI_INDIRECT_DST_V4:
 791         case AMDGPU::SI_INDIRECT_DST_V8:
 792         case AMDGPU::SI_INDIRECT_DST_V16:
 793           if (indirectDst(MI)) {
 794             // The block was split at this point. We can safely skip the middle
 795             // inserted block to the following which contains the rest of this
 796             // block's instructions.
 797             NextBB = std::next(BI);
 798             BE = MF.end();
 799             Next = MBB.end();
 800           }
 801
 802           break;
 803
 804         case AMDGPU::SI_RETURN: {
 805           assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
 806
 807           // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
 808           // because external bytecode will be appended at the end.
 809           if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
 810             // SI_RETURN is not the last instruction. Add an empty block at
 811             // the end and jump there.
 812             if (!EmptyMBBAtEnd) {
 813               EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
 814               MF.insert(MF.end(), EmptyMBBAtEnd);
 815             }
 816
 817             MBB.addSuccessor(EmptyMBBAtEnd);
 818             BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
 819                     .addMBB(EmptyMBBAtEnd);
 820             I->eraseFromParent();
 821           }
 822           break;
 823         }
 824       }
 825     }
 826   }
 827
 828   if (NeedFlat && MFI->IsKernel) {
 829     // TODO: What to use with function calls?
 830     // We will need to Initialize the flat scratch register pair.
 831     if (NeedFlat)
 832       MFI->setHasFlatInstructions(true);
 833   }
 834
 835   return true;
 836 }