contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

   1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// This pass adds instructions to enable whole quad mode for pixel
  11 /// shaders, and whole wavefront mode for all programs.
  12 ///
  13 /// Whole quad mode is required for derivative computations, but it interferes
  14 /// with shader side effects (stores and atomics). This pass is run on the
  15 /// scheduled machine IR but before register coalescing, so that machine SSA is
  16 /// available for analysis. It ensures that WQM is enabled when necessary, but
  17 /// disabled around stores and atomics.
  18 ///
  19 /// When necessary, this pass creates a function prolog
  20 ///
  21 ///   S_MOV_B64 LiveMask, EXEC
  22 ///   S_WQM_B64 EXEC, EXEC
  23 ///
  24 /// to enter WQM at the top of the function and surrounds blocks of Exact
  25 /// instructions by
  26 ///
  27 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
  28 ///   ...
  29 ///   S_MOV_B64 EXEC, Tmp
  30 ///
  31 /// We also compute when a sequence of instructions requires Whole Wavefront
  32 /// Mode (WWM) and insert instructions to save and restore it:
  33 ///
  34 /// S_OR_SAVEEXEC_B64 Tmp, -1
  35 /// ...
  36 /// S_MOV_B64 EXEC, Tmp
  37 ///
  38 /// In order to avoid excessive switching during sequences of Exact
  39 /// instructions, the pass first analyzes which instructions must be run in WQM
  40 /// (aka which instructions produce values that lead to derivative
  41 /// computations).
  42 ///
  43 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
  44 ///
  45 /// There is room for improvement given better control flow analysis:
  46 ///
  47 ///  (1) at the top level (outside of control flow statements, and as long as
  48 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
  49 ///      the LiveMask (this is implemented for the entry block).
  50 ///
  51 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
  52 ///      consist of exact and don't-care instructions, the switch only has to
  53 ///      be done at the entry and exit points rather than potentially in each
  54 ///      block of the region.
  55 ///
  56 //===----------------------------------------------------------------------===//
  57
  58 #include "AMDGPU.h"
  59 #include "AMDGPUSubtarget.h"
  60 #include "SIInstrInfo.h"
  61 #include "SIMachineFunctionInfo.h"
  62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  63 #include "llvm/ADT/DenseMap.h"
  64 #include "llvm/ADT/PostOrderIterator.h"
  65 #include "llvm/ADT/SmallVector.h"
  66 #include "llvm/ADT/StringRef.h"
  67 #include "llvm/CodeGen/LiveInterval.h"
  68 #include "llvm/CodeGen/LiveIntervals.h"
  69 #include "llvm/CodeGen/MachineBasicBlock.h"
  70 #include "llvm/CodeGen/MachineFunction.h"
  71 #include "llvm/CodeGen/MachineFunctionPass.h"
  72 #include "llvm/CodeGen/MachineInstr.h"
  73 #include "llvm/CodeGen/MachineInstrBuilder.h"
  74 #include "llvm/CodeGen/MachineOperand.h"
  75 #include "llvm/CodeGen/MachineRegisterInfo.h"
  76 #include "llvm/CodeGen/SlotIndexes.h"
  77 #include "llvm/CodeGen/TargetRegisterInfo.h"
  78 #include "llvm/IR/CallingConv.h"
  79 #include "llvm/IR/DebugLoc.h"
  80 #include "llvm/MC/MCRegisterInfo.h"
  81 #include "llvm/Pass.h"
  82 #include "llvm/Support/Debug.h"
  83 #include "llvm/Support/raw_ostream.h"
  84 #include <cassert>
  85 #include <vector>
  86
  87 using namespace llvm;
  88
  89 #define DEBUG_TYPE "si-wqm"
  90
  91 namespace {
  92
  93 enum {
  94   StateWQM = 0x1,
  95   StateWWM = 0x2,
  96   StateExact = 0x4,
  97 };
  98
  99 struct PrintState {
 100 public:
 101   int State;
 102
 103   explicit PrintState(int State) : State(State) {}
 104 };
 105
 106 #ifndef NDEBUG
 107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
 108   if (PS.State & StateWQM)
 109     OS << "WQM";
 110   if (PS.State & StateWWM) {
 111     if (PS.State & StateWQM)
 112       OS << '|';
 113     OS << "WWM";
 114   }
 115   if (PS.State & StateExact) {
 116     if (PS.State & (StateWQM | StateWWM))
 117       OS << '|';
 118     OS << "Exact";
 119   }
 120
 121   return OS;
 122 }
 123 #endif
 124
 125 struct InstrInfo {
 126   char Needs = 0;
 127   char Disabled = 0;
 128   char OutNeeds = 0;
 129 };
 130
 131 struct BlockInfo {
 132   char Needs = 0;
 133   char InNeeds = 0;
 134   char OutNeeds = 0;
 135 };
 136
 137 struct WorkItem {
 138   MachineBasicBlock *MBB = nullptr;
 139   MachineInstr *MI = nullptr;
 140
 141   WorkItem() = default;
 142   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
 143   WorkItem(MachineInstr *MI) : MI(MI) {}
 144 };
 145
 146 class SIWholeQuadMode : public MachineFunctionPass {
 147 private:
 148   CallingConv::ID CallingConv;
 149   const SIInstrInfo *TII;
 150   const SIRegisterInfo *TRI;
 151   const GCNSubtarget *ST;
 152   MachineRegisterInfo *MRI;
 153   LiveIntervals *LIS;
 154
 155   DenseMap<const MachineInstr *, InstrInfo> Instructions;
 156   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
 157   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 158   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
 159
 160   void printInfo();
 161
 162   void markInstruction(MachineInstr &MI, char Flag,
 163                        std::vector<WorkItem> &Worklist);
 164   void markInstructionUses(const MachineInstr &MI, char Flag,
 165                            std::vector<WorkItem> &Worklist);
 166   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
 167   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
 168   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
 169   char analyzeFunction(MachineFunction &MF);
 170
 171   bool requiresCorrectState(const MachineInstr &MI) const;
 172
 173   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
 174                                       MachineBasicBlock::iterator Before);
 175   MachineBasicBlock::iterator
 176   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
 177                    MachineBasicBlock::iterator Last, bool PreferLast,
 178                    bool SaveSCC);
 179   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
 180                unsigned SaveWQM, unsigned LiveMaskReg);
 181   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
 182              unsigned SavedWQM);
 183   void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
 184              unsigned SaveOrig);
 185   void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
 186                unsigned SavedOrig);
 187   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 188
 189   void lowerLiveMaskQueries(unsigned LiveMaskReg);
 190   void lowerCopyInstrs();
 191
 192 public:
 193   static char ID;
 194
 195   SIWholeQuadMode() :
 196     MachineFunctionPass(ID) { }
 197
 198   bool runOnMachineFunction(MachineFunction &MF) override;
 199
 200   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
 201
 202   void getAnalysisUsage(AnalysisUsage &AU) const override {
 203     AU.addRequired<LiveIntervals>();
 204     AU.addPreserved<SlotIndexes>();
 205     AU.addPreserved<LiveIntervals>();
 206     AU.setPreservesCFG();
 207     MachineFunctionPass::getAnalysisUsage(AU);
 208   }
 209 };
 210
 211 } // end anonymous namespace
 212
// Storage for the pass identifier declared in the class.
char SIWholeQuadMode::ID = 0;

// Register the pass under DEBUG_TYPE ("si-wqm") and declare its dependency on
// LiveIntervals.
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

// Externally visible handle that aliases the pass identifier.
char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
 222
 223 FunctionPass *llvm::createSIWholeQuadModePass() {
 224   return new SIWholeQuadMode;
 225 }
 226
 227 #ifndef NDEBUG
 228 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
 229   for (const auto &BII : Blocks) {
 230     dbgs() << "\n"
 231            << printMBBReference(*BII.first) << ":\n"
 232            << "  InNeeds = " << PrintState(BII.second.InNeeds)
 233            << ", Needs = " << PrintState(BII.second.Needs)
 234            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
 235
 236     for (const MachineInstr &MI : *BII.first) {
 237       auto III = Instructions.find(&MI);
 238       if (III == Instructions.end())
 239         continue;
 240
 241       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
 242              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
 243     }
 244   }
 245 }
 246 #endif
 247
 248 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
 249                                       std::vector<WorkItem> &Worklist) {
 250   InstrInfo &II = Instructions[&MI];
 251
 252   assert(!(Flag & StateExact) && Flag != 0);
 253
 254   // Remove any disabled states from the flag. The user that required it gets
 255   // an undefined value in the helper lanes. For example, this can happen if
 256   // the result of an atomic is used by instruction that requires WQM, where
 257   // ignoring the request for WQM is correct as per the relevant specs.
 258   Flag &= ~II.Disabled;
 259
 260   // Ignore if the flag is already encompassed by the existing needs, or we
 261   // just disabled everything.
 262   if ((II.Needs & Flag) == Flag)
 263     return;
 264
 265   II.Needs |= Flag;
 266   Worklist.push_back(&MI);
 267 }
 268
 269 /// Mark all instructions defining the uses in \p MI with \p Flag.
 270 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
 271                                           std::vector<WorkItem> &Worklist) {
 272   for (const MachineOperand &Use : MI.uses()) {
 273     if (!Use.isReg() || !Use.isUse())
 274       continue;
 275
 276     unsigned Reg = Use.getReg();
 277
 278     // Handle physical registers that we need to track; this is mostly relevant
 279     // for VCC, which can appear as the (implicit) input of a uniform branch,
 280     // e.g. when a loop counter is stored in a VGPR.
 281     if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
 282       if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
 283         continue;
 284
 285       for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
 286         LiveRange &LR = LIS->getRegUnit(*RegUnit);
 287         const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
 288         if (!Value)
 289           continue;
 290
 291         // Since we're in machine SSA, we do not need to track physical
 292         // registers across basic blocks.
 293         if (Value->isPHIDef())
 294           continue;
 295
 296         markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
 297                         Worklist);
 298       }
 299
 300       continue;
 301     }
 302
 303     for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
 304       markInstruction(DefMI, Flag, Worklist);
 305   }
 306 }
 307
 308 // Scan instructions to determine which ones require an Exact execmask and
 309 // which ones seed WQM requirements.
 310 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
 311                                        std::vector<WorkItem> &Worklist) {
 312   char GlobalFlags = 0;
 313   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
 314   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
 315
 316   // We need to visit the basic blocks in reverse post-order so that we visit
 317   // defs before uses, in particular so that we don't accidentally mark an
 318   // instruction as needing e.g. WQM before visiting it and realizing it needs
 319   // WQM disabled.
 320   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
 321   for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
 322     MachineBasicBlock &MBB = **BI;
 323     BlockInfo &BBI = Blocks[&MBB];
 324
 325     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
 326       MachineInstr &MI = *II;
 327       InstrInfo &III = Instructions[&MI];
 328       unsigned Opcode = MI.getOpcode();
 329       char Flags = 0;
 330
 331       if (TII->isWQM(Opcode)) {
 332         // Sampling instructions don't need to produce results for all pixels
 333         // in a quad, they just require all inputs of a quad to have been
 334         // computed for derivatives.
 335         markInstructionUses(MI, StateWQM, Worklist);
 336         GlobalFlags |= StateWQM;
 337         continue;
 338       } else if (Opcode == AMDGPU::WQM) {
 339         // The WQM intrinsic requires its output to have all the helper lanes
 340         // correct, so we need it to be in WQM.
 341         Flags = StateWQM;
 342         LowerToCopyInstrs.push_back(&MI);
 343       } else if (Opcode == AMDGPU::WWM) {
 344         // The WWM intrinsic doesn't make the same guarantee, and plus it needs
 345         // to be executed in WQM or Exact so that its copy doesn't clobber
 346         // inactive lanes.
 347         markInstructionUses(MI, StateWWM, Worklist);
 348         GlobalFlags |= StateWWM;
 349         LowerToCopyInstrs.push_back(&MI);
 350         continue;
 351       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
 352                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
 353         III.Disabled = StateWWM;
 354         MachineOperand &Inactive = MI.getOperand(2);
 355         if (Inactive.isReg()) {
 356           if (Inactive.isUndef()) {
 357             LowerToCopyInstrs.push_back(&MI);
 358           } else {
 359             unsigned Reg = Inactive.getReg();
 360             if (TargetRegisterInfo::isVirtualRegister(Reg)) {
 361               for (MachineInstr &DefMI : MRI->def_instructions(Reg))
 362                 markInstruction(DefMI, StateWWM, Worklist);
 363             }
 364           }
 365         }
 366         SetInactiveInstrs.push_back(&MI);
 367         continue;
 368       } else if (TII->isDisableWQM(MI)) {
 369         BBI.Needs |= StateExact;
 370         if (!(BBI.InNeeds & StateExact)) {
 371           BBI.InNeeds |= StateExact;
 372           Worklist.push_back(&MBB);
 373         }
 374         GlobalFlags |= StateExact;
 375         III.Disabled = StateWQM | StateWWM;
 376         continue;
 377       } else {
 378         if (Opcode == AMDGPU::SI_PS_LIVE) {
 379           LiveMaskQueries.push_back(&MI);
 380         } else if (WQMOutputs) {
 381           // The function is in machine SSA form, which means that physical
 382           // VGPRs correspond to shader inputs and outputs. Inputs are
 383           // only used, outputs are only defined.
 384           for (const MachineOperand &MO : MI.defs()) {
 385             if (!MO.isReg())
 386               continue;
 387
 388             unsigned Reg = MO.getReg();
 389
 390             if (!TRI->isVirtualRegister(Reg) &&
 391                 TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
 392               Flags = StateWQM;
 393               break;
 394             }
 395           }
 396         }
 397
 398         if (!Flags)
 399           continue;
 400       }
 401
 402       markInstruction(MI, Flags, Worklist);
 403       GlobalFlags |= Flags;
 404     }
 405   }
 406
 407   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
 408   // ever used anywhere in the function. This implements the corresponding
 409   // semantics of @llvm.amdgcn.set.inactive.
 410   if (GlobalFlags & StateWQM) {
 411     for (MachineInstr *MI : SetInactiveInstrs)
 412       markInstruction(*MI, StateWQM, Worklist);
 413   }
 414
 415   return GlobalFlags;
 416 }
 417
 418 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
 419                                            std::vector<WorkItem>& Worklist) {
 420   MachineBasicBlock *MBB = MI.getParent();
 421   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
 422   BlockInfo &BI = Blocks[MBB];
 423
 424   // Control flow-type instructions and stores to temporary memory that are
 425   // followed by WQM computations must themselves be in WQM.
 426   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
 427       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
 428     Instructions[&MI].Needs = StateWQM;
 429     II.Needs = StateWQM;
 430   }
 431
 432   // Propagate to block level
 433   if (II.Needs & StateWQM) {
 434     BI.Needs |= StateWQM;
 435     if (!(BI.InNeeds & StateWQM)) {
 436       BI.InNeeds |= StateWQM;
 437       Worklist.push_back(MBB);
 438     }
 439   }
 440
 441   // Propagate backwards within block
 442   if (MachineInstr *PrevMI = MI.getPrevNode()) {
 443     char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
 444     if (!PrevMI->isPHI()) {
 445       InstrInfo &PrevII = Instructions[PrevMI];
 446       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
 447         PrevII.OutNeeds |= InNeeds;
 448         Worklist.push_back(PrevMI);
 449       }
 450     }
 451   }
 452
 453   // Propagate WQM flag to instruction inputs
 454   assert(!(II.Needs & StateExact));
 455
 456   if (II.Needs != 0)
 457     markInstructionUses(MI, II.Needs, Worklist);
 458
 459   // Ensure we process a block containing WWM, even if it does not require any
 460   // WQM transitions.
 461   if (II.Needs & StateWWM)
 462     BI.Needs |= StateWWM;
 463 }
 464
 465 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
 466                                      std::vector<WorkItem>& Worklist) {
 467   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
 468
 469   // Propagate through instructions
 470   if (!MBB.empty()) {
 471     MachineInstr *LastMI = &*MBB.rbegin();
 472     InstrInfo &LastII = Instructions[LastMI];
 473     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
 474       LastII.OutNeeds |= BI.OutNeeds;
 475       Worklist.push_back(LastMI);
 476     }
 477   }
 478
 479   // Predecessor blocks must provide for our WQM/Exact needs.
 480   for (MachineBasicBlock *Pred : MBB.predecessors()) {
 481     BlockInfo &PredBI = Blocks[Pred];
 482     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
 483       continue;
 484
 485     PredBI.OutNeeds |= BI.InNeeds;
 486     PredBI.InNeeds |= BI.InNeeds;
 487     Worklist.push_back(Pred);
 488   }
 489
 490   // All successors must be prepared to accept the same set of WQM/Exact data.
 491   for (MachineBasicBlock *Succ : MBB.successors()) {
 492     BlockInfo &SuccBI = Blocks[Succ];
 493     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
 494       continue;
 495
 496     SuccBI.InNeeds |= BI.OutNeeds;
 497     Worklist.push_back(Succ);
 498   }
 499 }
 500
 501 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
 502   std::vector<WorkItem> Worklist;
 503   char GlobalFlags = scanInstructions(MF, Worklist);
 504
 505   while (!Worklist.empty()) {
 506     WorkItem WI = Worklist.back();
 507     Worklist.pop_back();
 508
 509     if (WI.MI)
 510       propagateInstruction(*WI.MI, Worklist);
 511     else
 512       propagateBlock(*WI.MBB, Worklist);
 513   }
 514
 515   return GlobalFlags;
 516 }
 517
 518 /// Whether \p MI really requires the exec state computed during analysis.
 519 ///
 520 /// Scalar instructions must occasionally be marked WQM for correct propagation
 521 /// (e.g. thread masks leading up to branches), but when it comes to actual
 522 /// execution, they don't care about EXEC.
 523 bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
 524   if (MI.isTerminator())
 525     return true;
 526
 527   // Skip instructions that are not affected by EXEC
 528   if (TII->isScalarUnit(MI))
 529     return false;
 530
 531   // Generic instructions such as COPY will either disappear by register
 532   // coalescing or be lowered to SALU or VALU instructions.
 533   if (MI.isTransient()) {
 534     if (MI.getNumExplicitOperands() >= 1) {
 535       const MachineOperand &Op = MI.getOperand(0);
 536       if (Op.isReg()) {
 537         if (TRI->isSGPRReg(*MRI, Op.getReg())) {
 538           // SGPR instructions are not affected by EXEC
 539           return false;
 540         }
 541       }
 542     }
 543   }
 544
 545   return true;
 546 }
 547
 548 MachineBasicBlock::iterator
 549 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
 550                          MachineBasicBlock::iterator Before) {
 551   unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 552
 553   MachineInstr *Save =
 554       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
 555           .addReg(AMDGPU::SCC);
 556   MachineInstr *Restore =
 557       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
 558           .addReg(SaveReg);
 559
 560   LIS->InsertMachineInstrInMaps(*Save);
 561   LIS->InsertMachineInstrInMaps(*Restore);
 562   LIS->createAndComputeVirtRegInterval(SaveReg);
 563
 564   return Restore;
 565 }
 566
 567 // Return an iterator in the (inclusive) range [First, Last] at which
 568 // instructions can be safely inserted, keeping in mind that some of the
 569 // instructions we want to add necessarily clobber SCC.
 570 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
 571     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
 572     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
 573   if (!SaveSCC)
 574     return PreferLast ? Last : First;
 575
 576   LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
 577   auto MBBE = MBB.end();
 578   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
 579                                      : LIS->getMBBEndIdx(&MBB);
 580   SlotIndex LastIdx =
 581       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
 582   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
 583   const LiveRange::Segment *S;
 584
 585   for (;;) {
 586     S = LR.getSegmentContaining(Idx);
 587     if (!S)
 588       break;
 589
 590     if (PreferLast) {
 591       SlotIndex Next = S->start.getBaseIndex();
 592       if (Next < FirstIdx)
 593         break;
 594       Idx = Next;
 595     } else {
 596       SlotIndex Next = S->end.getNextIndex().getBaseIndex();
 597       if (Next > LastIdx)
 598         break;
 599       Idx = Next;
 600     }
 601   }
 602
 603   MachineBasicBlock::iterator MBBI;
 604
 605   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
 606     MBBI = MI;
 607   else {
 608     assert(Idx == LIS->getMBBEndIdx(&MBB));
 609     MBBI = MBB.end();
 610   }
 611
 612   if (S)
 613     MBBI = saveSCC(MBB, MBBI);
 614
 615   return MBBI;
 616 }
 617
 618 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
 619                               MachineBasicBlock::iterator Before,
 620                               unsigned SaveWQM, unsigned LiveMaskReg) {
 621   MachineInstr *MI;
 622
 623   if (SaveWQM) {
 624     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
 625                    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
 626                  SaveWQM)
 627              .addReg(LiveMaskReg);
 628   } else {
 629     unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 630     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
 631                    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
 632                  Exec)
 633              .addReg(Exec)
 634              .addReg(LiveMaskReg);
 635   }
 636
 637   LIS->InsertMachineInstrInMaps(*MI);
 638 }
 639
 640 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
 641                             MachineBasicBlock::iterator Before,
 642                             unsigned SavedWQM) {
 643   MachineInstr *MI;
 644
 645   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 646   if (SavedWQM) {
 647     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
 648              .addReg(SavedWQM);
 649   } else {
 650     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
 651                    AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
 652                  Exec)
 653              .addReg(Exec);
 654   }
 655
 656   LIS->InsertMachineInstrInMaps(*MI);
 657 }
 658
 659 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
 660                             MachineBasicBlock::iterator Before,
 661                             unsigned SaveOrig) {
 662   MachineInstr *MI;
 663
 664   assert(SaveOrig);
 665   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
 666            .addImm(-1);
 667   LIS->InsertMachineInstrInMaps(*MI);
 668 }
 669
 670 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
 671                               MachineBasicBlock::iterator Before,
 672                               unsigned SavedOrig) {
 673   MachineInstr *MI;
 674
 675   assert(SavedOrig);
 676   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
 677                ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
 678            .addReg(SavedOrig);
 679   LIS->InsertMachineInstrInMaps(*MI);
 680 }
 681
 682 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
 683                                    bool isEntry) {
 684   auto BII = Blocks.find(&MBB);
 685   if (BII == Blocks.end())
 686     return;
 687
 688   const BlockInfo &BI = BII->second;
 689
 690   // This is a non-entry block that is WQM throughout, so no need to do
 691   // anything.
 692   if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
 693     return;
 694
 695   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
 696                     << ":\n");
 697
 698   unsigned SavedWQMReg = 0;
 699   unsigned SavedNonWWMReg = 0;
 700   bool WQMFromExec = isEntry;
 701   char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
 702   char NonWWMState = 0;
 703   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
 704
 705   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
 706   if (isEntry)
 707     ++II; // Skip the instruction that saves LiveMask
 708
 709   // This stores the first instruction where it's safe to switch from WQM to
 710   // Exact or vice versa.
 711   MachineBasicBlock::iterator FirstWQM = IE;
 712
 713   // This stores the first instruction where it's safe to switch from WWM to
 714   // Exact/WQM or to switch to WWM. It must always be the same as, or after,
 715   // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
 716   // switch to/from WQM as well.
 717   MachineBasicBlock::iterator FirstWWM = IE;
 718   for (;;) {
 719     MachineBasicBlock::iterator Next = II;
 720     char Needs = StateExact | StateWQM; // WWM is disabled by default
 721     char OutNeeds = 0;
 722
 723     if (FirstWQM == IE)
 724       FirstWQM = II;
 725
 726     if (FirstWWM == IE)
 727       FirstWWM = II;
 728
 729     // First, figure out the allowed states (Needs) based on the propagated
 730     // flags.
 731     if (II != IE) {
 732       MachineInstr &MI = *II;
 733
 734       if (requiresCorrectState(MI)) {
 735         auto III = Instructions.find(&MI);
 736         if (III != Instructions.end()) {
 737           if (III->second.Needs & StateWWM)
 738             Needs = StateWWM;
 739           else if (III->second.Needs & StateWQM)
 740             Needs = StateWQM;
 741           else
 742             Needs &= ~III->second.Disabled;
 743           OutNeeds = III->second.OutNeeds;
 744         }
 745       } else {
 746         // If the instruction doesn't actually need a correct EXEC, then we can
 747         // safely leave WWM enabled.
 748         Needs = StateExact | StateWQM | StateWWM;
 749       }
 750
 751       if (MI.isTerminator() && OutNeeds == StateExact)
 752         Needs = StateExact;
 753
 754       if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
 755         MI.getOperand(3).setImm(1);
 756
 757       ++Next;
 758     } else {
 759       // End of basic block
 760       if (BI.OutNeeds & StateWQM)
 761         Needs = StateWQM;
 762       else if (BI.OutNeeds == StateExact)
 763         Needs = StateExact;
 764       else
 765         Needs = StateWQM | StateExact;
 766     }
 767
 768     // Now, transition if necessary.
 769     if (!(Needs & State)) {
 770       MachineBasicBlock::iterator First;
 771       if (State == StateWWM || Needs == StateWWM) {
 772         // We must switch to or from WWM
 773         First = FirstWWM;
 774       } else {
 775         // We only need to switch to/from WQM, so we can use FirstWQM
 776         First = FirstWQM;
 777       }
 778
 779       MachineBasicBlock::iterator Before =
 780           prepareInsertion(MBB, First, II, Needs == StateWQM,
 781                            Needs == StateExact || WQMFromExec);
 782
 783       if (State == StateWWM) {
 784         assert(SavedNonWWMReg);
 785         fromWWM(MBB, Before, SavedNonWWMReg);
 786         State = NonWWMState;
 787       }
 788
 789       if (Needs == StateWWM) {
 790         NonWWMState = State;
 791         SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
 792         toWWM(MBB, Before, SavedNonWWMReg);
 793         State = StateWWM;
 794       } else {
 795         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
 796           if (!WQMFromExec && (OutNeeds & StateWQM))
 797             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
 798
 799           toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
 800           State = StateExact;
 801         } else if (State == StateExact && (Needs & StateWQM) &&
 802                    !(Needs & StateExact)) {
 803           assert(WQMFromExec == (SavedWQMReg == 0));
 804
 805           toWQM(MBB, Before, SavedWQMReg);
 806
 807           if (SavedWQMReg) {
 808             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
 809             SavedWQMReg = 0;
 810           }
 811           State = StateWQM;
 812         } else {
 813           // We can get here if we transitioned from WWM to a non-WWM state that
 814           // already matches our needs, but we shouldn't need to do anything.
 815           assert(Needs & State);
 816         }
 817       }
 818     }
 819
 820     if (Needs != (StateExact | StateWQM | StateWWM)) {
 821       if (Needs != (StateExact | StateWQM))
 822         FirstWQM = IE;
 823       FirstWWM = IE;
 824     }
 825
 826     if (II == IE)
 827       break;
 828     II = Next;
 829   }
 830 }
 831
 832 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
 833   for (MachineInstr *MI : LiveMaskQueries) {
 834     const DebugLoc &DL = MI->getDebugLoc();
 835     unsigned Dest = MI->getOperand(0).getReg();
 836     MachineInstr *Copy =
 837         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
 838             .addReg(LiveMaskReg);
 839
 840     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
 841     MI->eraseFromParent();
 842   }
 843 }
 844
 845 void SIWholeQuadMode::lowerCopyInstrs() {
 846   for (MachineInstr *MI : LowerToCopyInstrs) {
 847     for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
 848       MI->RemoveOperand(i);
 849
 850     const unsigned Reg = MI->getOperand(0).getReg();
 851
 852     if (TRI->isVGPR(*MRI, Reg)) {
 853       const TargetRegisterClass *regClass =
 854           TargetRegisterInfo::isVirtualRegister(Reg)
 855               ? MRI->getRegClass(Reg)
 856               : TRI->getPhysRegClass(Reg);
 857
 858       const unsigned MovOp = TII->getMovOpcode(regClass);
 859       MI->setDesc(TII->get(MovOp));
 860
 861       // And make it implicitly depend on exec (like all VALU movs should do).
 862       MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
 863     } else {
 864       MI->setDesc(TII->get(AMDGPU::COPY));
 865     }
 866   }
 867 }
 868
 869 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
 870   Instructions.clear();
 871   Blocks.clear();
 872   LiveMaskQueries.clear();
 873   LowerToCopyInstrs.clear();
 874   CallingConv = MF.getFunction().getCallingConv();
 875
 876   ST = &MF.getSubtarget<GCNSubtarget>();
 877
 878   TII = ST->getInstrInfo();
 879   TRI = &TII->getRegisterInfo();
 880   MRI = &MF.getRegInfo();
 881   LIS = &getAnalysis<LiveIntervals>();
 882
 883   char GlobalFlags = analyzeFunction(MF);
 884   unsigned LiveMaskReg = 0;
 885   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 886   if (!(GlobalFlags & StateWQM)) {
 887     lowerLiveMaskQueries(Exec);
 888     if (!(GlobalFlags & StateWWM))
 889       return !LiveMaskQueries.empty();
 890   } else {
 891     // Store a copy of the original live mask when required
 892     MachineBasicBlock &Entry = MF.front();
 893     MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
 894
 895     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
 896       LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
 897       MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
 898                                  TII->get(AMDGPU::COPY), LiveMaskReg)
 899                              .addReg(Exec);
 900       LIS->InsertMachineInstrInMaps(*MI);
 901     }
 902
 903     lowerLiveMaskQueries(LiveMaskReg);
 904
 905     if (GlobalFlags == StateWQM) {
 906       // For a shader that needs only WQM, we can just set it once.
 907       BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
 908                 AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
 909               Exec)
 910           .addReg(Exec);
 911
 912       lowerCopyInstrs();
 913       // EntryMI may become invalid here
 914       return true;
 915     }
 916   }
 917
 918   LLVM_DEBUG(printInfo());
 919
 920   lowerCopyInstrs();
 921
 922   // Handle the general case
 923   for (auto BII : Blocks)
 924     processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
 925
 926   // Physical registers like SCC aren't tracked by default anyway, so just
 927   // removing the ranges we computed is the simplest option for maintaining
 928   // the analysis results.
 929   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
 930
 931   return true;
 932 }