contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

   1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// This pass adds instructions to enable whole quad mode for pixel
  12 /// shaders, and whole wavefront mode for all programs.
  13 ///
  14 /// Whole quad mode is required for derivative computations, but it interferes
  15 /// with shader side effects (stores and atomics). This pass is run on the
  16 /// scheduled machine IR but before register coalescing, so that machine SSA is
  17 /// available for analysis. It ensures that WQM is enabled when necessary, but
  18 /// disabled around stores and atomics.
  19 ///
  20 /// When necessary, this pass creates a function prolog
  21 ///
  22 ///   S_MOV_B64 LiveMask, EXEC
  23 ///   S_WQM_B64 EXEC, EXEC
  24 ///
  25 /// to enter WQM at the top of the function and surrounds blocks of Exact
  26 /// instructions by
  27 ///
  28 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
  29 ///   ...
  30 ///   S_MOV_B64 EXEC, Tmp
  31 ///
  32 /// We also compute when a sequence of instructions requires Whole Wavefront
  33 /// Mode (WWM) and insert instructions to save and restore it:
  34 ///
  35 /// S_OR_SAVEEXEC_B64 Tmp, -1
  36 /// ...
  37 /// S_MOV_B64 EXEC, Tmp
  38 ///
  39 /// In order to avoid excessive switching during sequences of Exact
  40 /// instructions, the pass first analyzes which instructions must be run in WQM
  41 /// (aka which instructions produce values that lead to derivative
  42 /// computations).
  43 ///
  44 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
  45 ///
  46 /// There is room for improvement given better control flow analysis:
  47 ///
  48 ///  (1) at the top level (outside of control flow statements, and as long as
  49 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
  50 ///      the LiveMask (this is implemented for the entry block).
  51 ///
  52 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
  53 ///      consist of exact and don't-care instructions, the switch only has to
  54 ///      be done at the entry and exit points rather than potentially in each
  55 ///      block of the region.
  56 ///
  57 //===----------------------------------------------------------------------===//
  58
  59 #include "AMDGPU.h"
  60 #include "AMDGPUSubtarget.h"
  61 #include "SIInstrInfo.h"
  62 #include "SIMachineFunctionInfo.h"
  63 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  64 #include "llvm/ADT/DenseMap.h"
  65 #include "llvm/ADT/PostOrderIterator.h"
  66 #include "llvm/ADT/SmallVector.h"
  67 #include "llvm/ADT/StringRef.h"
  68 #include "llvm/CodeGen/LiveInterval.h"
  69 #include "llvm/CodeGen/LiveIntervals.h"
  70 #include "llvm/CodeGen/MachineBasicBlock.h"
  71 #include "llvm/CodeGen/MachineFunction.h"
  72 #include "llvm/CodeGen/MachineFunctionPass.h"
  73 #include "llvm/CodeGen/MachineInstr.h"
  74 #include "llvm/CodeGen/MachineInstrBuilder.h"
  75 #include "llvm/CodeGen/MachineOperand.h"
  76 #include "llvm/CodeGen/MachineRegisterInfo.h"
  77 #include "llvm/CodeGen/SlotIndexes.h"
  78 #include "llvm/CodeGen/TargetRegisterInfo.h"
  79 #include "llvm/IR/CallingConv.h"
  80 #include "llvm/IR/DebugLoc.h"
  81 #include "llvm/MC/MCRegisterInfo.h"
  82 #include "llvm/Pass.h"
  83 #include "llvm/Support/Debug.h"
  84 #include "llvm/Support/raw_ostream.h"
  85 #include <cassert>
  86 #include <vector>
  87
  88 using namespace llvm;
  89
  90 #define DEBUG_TYPE "si-wqm"
  91
  92 namespace {
  93
  94 enum {
  95   StateWQM = 0x1,
  96   StateWWM = 0x2,
  97   StateExact = 0x4,
  98 };
  99
 100 struct PrintState {
 101 public:
 102   int State;
 103
 104   explicit PrintState(int State) : State(State) {}
 105 };
 106
 107 #ifndef NDEBUG
 108 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
 109   if (PS.State & StateWQM)
 110     OS << "WQM";
 111   if (PS.State & StateWWM) {
 112     if (PS.State & StateWQM)
 113       OS << '|';
 114     OS << "WWM";
 115   }
 116   if (PS.State & StateExact) {
 117     if (PS.State & (StateWQM | StateWWM))
 118       OS << '|';
 119     OS << "Exact";
 120   }
 121
 122   return OS;
 123 }
 124 #endif
 125
 126 struct InstrInfo {
 127   char Needs = 0;
 128   char Disabled = 0;
 129   char OutNeeds = 0;
 130 };
 131
 132 struct BlockInfo {
 133   char Needs = 0;
 134   char InNeeds = 0;
 135   char OutNeeds = 0;
 136 };
 137
 138 struct WorkItem {
 139   MachineBasicBlock *MBB = nullptr;
 140   MachineInstr *MI = nullptr;
 141
 142   WorkItem() = default;
 143   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
 144   WorkItem(MachineInstr *MI) : MI(MI) {}
 145 };
 146
 147 class SIWholeQuadMode : public MachineFunctionPass {
 148 private:
 149   CallingConv::ID CallingConv;
 150   const SIInstrInfo *TII;
 151   const SIRegisterInfo *TRI;
 152   MachineRegisterInfo *MRI;
 153   LiveIntervals *LIS;
 154
 155   DenseMap<const MachineInstr *, InstrInfo> Instructions;
 156   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
 157   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 158   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
 159
 160   void printInfo();
 161
 162   void markInstruction(MachineInstr &MI, char Flag,
 163                        std::vector<WorkItem> &Worklist);
 164   void markInstructionUses(const MachineInstr &MI, char Flag,
 165                            std::vector<WorkItem> &Worklist);
 166   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
 167   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
 168   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
 169   char analyzeFunction(MachineFunction &MF);
 170
 171   bool requiresCorrectState(const MachineInstr &MI) const;
 172
 173   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
 174                                       MachineBasicBlock::iterator Before);
 175   MachineBasicBlock::iterator
 176   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
 177                    MachineBasicBlock::iterator Last, bool PreferLast,
 178                    bool SaveSCC);
 179   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
 180                unsigned SaveWQM, unsigned LiveMaskReg);
 181   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
 182              unsigned SavedWQM);
 183   void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
 184              unsigned SaveOrig);
 185   void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
 186                unsigned SavedOrig);
 187   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 188
 189   void lowerLiveMaskQueries(unsigned LiveMaskReg);
 190   void lowerCopyInstrs();
 191
 192 public:
 193   static char ID;
 194
 195   SIWholeQuadMode() :
 196     MachineFunctionPass(ID) { }
 197
 198   bool runOnMachineFunction(MachineFunction &MF) override;
 199
 200   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
 201
 202   void getAnalysisUsage(AnalysisUsage &AU) const override {
 203     AU.addRequired<LiveIntervals>();
 204     AU.setPreservesCFG();
 205     MachineFunctionPass::getAnalysisUsage(AU);
 206   }
 207 };
 208
 209 } // end anonymous namespace
 210
 211 char SIWholeQuadMode::ID = 0;
 212
 213 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
 214                       false)
 215 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 216 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
 217                     false)
 218
 219 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
 220
 221 FunctionPass *llvm::createSIWholeQuadModePass() {
 222   return new SIWholeQuadMode;
 223 }
 224
 225 #ifndef NDEBUG
 226 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
 227   for (const auto &BII : Blocks) {
 228     dbgs() << "\n"
 229            << printMBBReference(*BII.first) << ":\n"
 230            << "  InNeeds = " << PrintState(BII.second.InNeeds)
 231            << ", Needs = " << PrintState(BII.second.Needs)
 232            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
 233
 234     for (const MachineInstr &MI : *BII.first) {
 235       auto III = Instructions.find(&MI);
 236       if (III == Instructions.end())
 237         continue;
 238
 239       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
 240              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
 241     }
 242   }
 243 }
 244 #endif
 245
 246 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
 247                                       std::vector<WorkItem> &Worklist) {
 248   InstrInfo &II = Instructions[&MI];
 249
 250   assert(!(Flag & StateExact) && Flag != 0);
 251
 252   // Remove any disabled states from the flag. The user that required it gets
 253   // an undefined value in the helper lanes. For example, this can happen if
 254   // the result of an atomic is used by instruction that requires WQM, where
 255   // ignoring the request for WQM is correct as per the relevant specs.
 256   Flag &= ~II.Disabled;
 257
 258   // Ignore if the flag is already encompassed by the existing needs, or we
 259   // just disabled everything.
 260   if ((II.Needs & Flag) == Flag)
 261     return;
 262
 263   II.Needs |= Flag;
 264   Worklist.push_back(&MI);
 265 }
 266
 267 /// Mark all instructions defining the uses in \p MI with \p Flag.
 268 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
 269                                           std::vector<WorkItem> &Worklist) {
 270   for (const MachineOperand &Use : MI.uses()) {
 271     if (!Use.isReg() || !Use.isUse())
 272       continue;
 273
 274     unsigned Reg = Use.getReg();
 275
 276     // Handle physical registers that we need to track; this is mostly relevant
 277     // for VCC, which can appear as the (implicit) input of a uniform branch,
 278     // e.g. when a loop counter is stored in a VGPR.
 279     if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
 280       if (Reg == AMDGPU::EXEC)
 281         continue;
 282
 283       for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
 284         LiveRange &LR = LIS->getRegUnit(*RegUnit);
 285         const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
 286         if (!Value)
 287           continue;
 288
 289         // Since we're in machine SSA, we do not need to track physical
 290         // registers across basic blocks.
 291         if (Value->isPHIDef())
 292           continue;
 293
 294         markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
 295                         Worklist);
 296       }
 297
 298       continue;
 299     }
 300
 301     for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
 302       markInstruction(DefMI, Flag, Worklist);
 303   }
 304 }
 305
 306 // Scan instructions to determine which ones require an Exact execmask and
 307 // which ones seed WQM requirements.
 308 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
 309                                        std::vector<WorkItem> &Worklist) {
 310   char GlobalFlags = 0;
 311   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
 312   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
 313
 314   // We need to visit the basic blocks in reverse post-order so that we visit
 315   // defs before uses, in particular so that we don't accidentally mark an
 316   // instruction as needing e.g. WQM before visiting it and realizing it needs
 317   // WQM disabled.
 318   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
 319   for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
 320     MachineBasicBlock &MBB = **BI;
 321     BlockInfo &BBI = Blocks[&MBB];
 322
 323     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
 324       MachineInstr &MI = *II;
 325       InstrInfo &III = Instructions[&MI];
 326       unsigned Opcode = MI.getOpcode();
 327       char Flags = 0;
 328
 329       if (TII->isWQM(Opcode)) {
 330         // Sampling instructions don't need to produce results for all pixels
 331         // in a quad, they just require all inputs of a quad to have been
 332         // computed for derivatives.
 333         markInstructionUses(MI, StateWQM, Worklist);
 334         GlobalFlags |= StateWQM;
 335         continue;
 336       } else if (Opcode == AMDGPU::WQM) {
 337         // The WQM intrinsic requires its output to have all the helper lanes
 338         // correct, so we need it to be in WQM.
 339         Flags = StateWQM;
 340         LowerToCopyInstrs.push_back(&MI);
 341       } else if (Opcode == AMDGPU::WWM) {
 342         // The WWM intrinsic doesn't make the same guarantee, and plus it needs
 343         // to be executed in WQM or Exact so that its copy doesn't clobber
 344         // inactive lanes.
 345         markInstructionUses(MI, StateWWM, Worklist);
 346         GlobalFlags |= StateWWM;
 347         LowerToCopyInstrs.push_back(&MI);
 348         continue;
 349       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
 350                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
 351         III.Disabled = StateWWM;
 352         MachineOperand &Inactive = MI.getOperand(2);
 353         if (Inactive.isReg()) {
 354           if (Inactive.isUndef()) {
 355             LowerToCopyInstrs.push_back(&MI);
 356           } else {
 357             unsigned Reg = Inactive.getReg();
 358             if (TargetRegisterInfo::isVirtualRegister(Reg)) {
 359               for (MachineInstr &DefMI : MRI->def_instructions(Reg))
 360                 markInstruction(DefMI, StateWWM, Worklist);
 361             }
 362           }
 363         }
 364         SetInactiveInstrs.push_back(&MI);
 365         continue;
 366       } else if (TII->isDisableWQM(MI)) {
 367         BBI.Needs |= StateExact;
 368         if (!(BBI.InNeeds & StateExact)) {
 369           BBI.InNeeds |= StateExact;
 370           Worklist.push_back(&MBB);
 371         }
 372         GlobalFlags |= StateExact;
 373         III.Disabled = StateWQM | StateWWM;
 374         continue;
 375       } else {
 376         if (Opcode == AMDGPU::SI_PS_LIVE) {
 377           LiveMaskQueries.push_back(&MI);
 378         } else if (WQMOutputs) {
 379           // The function is in machine SSA form, which means that physical
 380           // VGPRs correspond to shader inputs and outputs. Inputs are
 381           // only used, outputs are only defined.
 382           for (const MachineOperand &MO : MI.defs()) {
 383             if (!MO.isReg())
 384               continue;
 385
 386             unsigned Reg = MO.getReg();
 387
 388             if (!TRI->isVirtualRegister(Reg) &&
 389                 TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
 390               Flags = StateWQM;
 391               break;
 392             }
 393           }
 394         }
 395
 396         if (!Flags)
 397           continue;
 398       }
 399
 400       markInstruction(MI, Flags, Worklist);
 401       GlobalFlags |= Flags;
 402     }
 403   }
 404
 405   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
 406   // ever used anywhere in the function. This implements the corresponding
 407   // semantics of @llvm.amdgcn.set.inactive.
 408   if (GlobalFlags & StateWQM) {
 409     for (MachineInstr *MI : SetInactiveInstrs)
 410       markInstruction(*MI, StateWQM, Worklist);
 411   }
 412
 413   return GlobalFlags;
 414 }
 415
 416 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
 417                                            std::vector<WorkItem>& Worklist) {
 418   MachineBasicBlock *MBB = MI.getParent();
 419   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
 420   BlockInfo &BI = Blocks[MBB];
 421
 422   // Control flow-type instructions and stores to temporary memory that are
 423   // followed by WQM computations must themselves be in WQM.
 424   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
 425       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
 426     Instructions[&MI].Needs = StateWQM;
 427     II.Needs = StateWQM;
 428   }
 429
 430   // Propagate to block level
 431   if (II.Needs & StateWQM) {
 432     BI.Needs |= StateWQM;
 433     if (!(BI.InNeeds & StateWQM)) {
 434       BI.InNeeds |= StateWQM;
 435       Worklist.push_back(MBB);
 436     }
 437   }
 438
 439   // Propagate backwards within block
 440   if (MachineInstr *PrevMI = MI.getPrevNode()) {
 441     char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
 442     if (!PrevMI->isPHI()) {
 443       InstrInfo &PrevII = Instructions[PrevMI];
 444       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
 445         PrevII.OutNeeds |= InNeeds;
 446         Worklist.push_back(PrevMI);
 447       }
 448     }
 449   }
 450
 451   // Propagate WQM flag to instruction inputs
 452   assert(!(II.Needs & StateExact));
 453
 454   if (II.Needs != 0)
 455     markInstructionUses(MI, II.Needs, Worklist);
 456
 457   // Ensure we process a block containing WWM, even if it does not require any
 458   // WQM transitions.
 459   if (II.Needs & StateWWM)
 460     BI.Needs |= StateWWM;
 461 }
 462
 463 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
 464                                      std::vector<WorkItem>& Worklist) {
 465   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
 466
 467   // Propagate through instructions
 468   if (!MBB.empty()) {
 469     MachineInstr *LastMI = &*MBB.rbegin();
 470     InstrInfo &LastII = Instructions[LastMI];
 471     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
 472       LastII.OutNeeds |= BI.OutNeeds;
 473       Worklist.push_back(LastMI);
 474     }
 475   }
 476
 477   // Predecessor blocks must provide for our WQM/Exact needs.
 478   for (MachineBasicBlock *Pred : MBB.predecessors()) {
 479     BlockInfo &PredBI = Blocks[Pred];
 480     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
 481       continue;
 482
 483     PredBI.OutNeeds |= BI.InNeeds;
 484     PredBI.InNeeds |= BI.InNeeds;
 485     Worklist.push_back(Pred);
 486   }
 487
 488   // All successors must be prepared to accept the same set of WQM/Exact data.
 489   for (MachineBasicBlock *Succ : MBB.successors()) {
 490     BlockInfo &SuccBI = Blocks[Succ];
 491     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
 492       continue;
 493
 494     SuccBI.InNeeds |= BI.OutNeeds;
 495     Worklist.push_back(Succ);
 496   }
 497 }
 498
 499 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
 500   std::vector<WorkItem> Worklist;
 501   char GlobalFlags = scanInstructions(MF, Worklist);
 502
 503   while (!Worklist.empty()) {
 504     WorkItem WI = Worklist.back();
 505     Worklist.pop_back();
 506
 507     if (WI.MI)
 508       propagateInstruction(*WI.MI, Worklist);
 509     else
 510       propagateBlock(*WI.MBB, Worklist);
 511   }
 512
 513   return GlobalFlags;
 514 }
 515
 516 /// Whether \p MI really requires the exec state computed during analysis.
 517 ///
 518 /// Scalar instructions must occasionally be marked WQM for correct propagation
 519 /// (e.g. thread masks leading up to branches), but when it comes to actual
 520 /// execution, they don't care about EXEC.
 521 bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
 522   if (MI.isTerminator())
 523     return true;
 524
 525   // Skip instructions that are not affected by EXEC
 526   if (TII->isScalarUnit(MI))
 527     return false;
 528
 529   // Generic instructions such as COPY will either disappear by register
 530   // coalescing or be lowered to SALU or VALU instructions.
 531   if (MI.isTransient()) {
 532     if (MI.getNumExplicitOperands() >= 1) {
 533       const MachineOperand &Op = MI.getOperand(0);
 534       if (Op.isReg()) {
 535         if (TRI->isSGPRReg(*MRI, Op.getReg())) {
 536           // SGPR instructions are not affected by EXEC
 537           return false;
 538         }
 539       }
 540     }
 541   }
 542
 543   return true;
 544 }
 545
 546 MachineBasicBlock::iterator
 547 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
 548                          MachineBasicBlock::iterator Before) {
 549   unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 550
 551   MachineInstr *Save =
 552       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
 553           .addReg(AMDGPU::SCC);
 554   MachineInstr *Restore =
 555       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
 556           .addReg(SaveReg);
 557
 558   LIS->InsertMachineInstrInMaps(*Save);
 559   LIS->InsertMachineInstrInMaps(*Restore);
 560   LIS->createAndComputeVirtRegInterval(SaveReg);
 561
 562   return Restore;
 563 }
 564
 565 // Return an iterator in the (inclusive) range [First, Last] at which
 566 // instructions can be safely inserted, keeping in mind that some of the
 567 // instructions we want to add necessarily clobber SCC.
 568 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
 569     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
 570     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
 571   if (!SaveSCC)
 572     return PreferLast ? Last : First;
 573
 574   LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
 575   auto MBBE = MBB.end();
 576   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
 577                                      : LIS->getMBBEndIdx(&MBB);
 578   SlotIndex LastIdx =
 579       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
 580   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
 581   const LiveRange::Segment *S;
 582
 583   for (;;) {
 584     S = LR.getSegmentContaining(Idx);
 585     if (!S)
 586       break;
 587
 588     if (PreferLast) {
 589       SlotIndex Next = S->start.getBaseIndex();
 590       if (Next < FirstIdx)
 591         break;
 592       Idx = Next;
 593     } else {
 594       SlotIndex Next = S->end.getNextIndex().getBaseIndex();
 595       if (Next > LastIdx)
 596         break;
 597       Idx = Next;
 598     }
 599   }
 600
 601   MachineBasicBlock::iterator MBBI;
 602
 603   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
 604     MBBI = MI;
 605   else {
 606     assert(Idx == LIS->getMBBEndIdx(&MBB));
 607     MBBI = MBB.end();
 608   }
 609
 610   if (S)
 611     MBBI = saveSCC(MBB, MBBI);
 612
 613   return MBBI;
 614 }
 615
 616 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
 617                               MachineBasicBlock::iterator Before,
 618                               unsigned SaveWQM, unsigned LiveMaskReg) {
 619   MachineInstr *MI;
 620
 621   if (SaveWQM) {
 622     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
 623                  SaveWQM)
 624              .addReg(LiveMaskReg);
 625   } else {
 626     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
 627                  AMDGPU::EXEC)
 628              .addReg(AMDGPU::EXEC)
 629              .addReg(LiveMaskReg);
 630   }
 631
 632   LIS->InsertMachineInstrInMaps(*MI);
 633 }
 634
 635 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
 636                             MachineBasicBlock::iterator Before,
 637                             unsigned SavedWQM) {
 638   MachineInstr *MI;
 639
 640   if (SavedWQM) {
 641     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
 642              .addReg(SavedWQM);
 643   } else {
 644     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
 645                  AMDGPU::EXEC)
 646              .addReg(AMDGPU::EXEC);
 647   }
 648
 649   LIS->InsertMachineInstrInMaps(*MI);
 650 }
 651
 652 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
 653                             MachineBasicBlock::iterator Before,
 654                             unsigned SaveOrig) {
 655   MachineInstr *MI;
 656
 657   assert(SaveOrig);
 658   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
 659                SaveOrig)
 660            .addImm(-1);
 661   LIS->InsertMachineInstrInMaps(*MI);
 662 }
 663
 664 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
 665                               MachineBasicBlock::iterator Before,
 666                               unsigned SavedOrig) {
 667   MachineInstr *MI;
 668
 669   assert(SavedOrig);
 670   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
 671            .addReg(SavedOrig);
 672   LIS->InsertMachineInstrInMaps(*MI);
 673 }
 674
 675 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
 676                                    bool isEntry) {
 677   auto BII = Blocks.find(&MBB);
 678   if (BII == Blocks.end())
 679     return;
 680
 681   const BlockInfo &BI = BII->second;
 682
 683   // This is a non-entry block that is WQM throughout, so no need to do
 684   // anything.
 685   if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
 686     return;
 687
 688   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
 689                     << ":\n");
 690
 691   unsigned SavedWQMReg = 0;
 692   unsigned SavedNonWWMReg = 0;
 693   bool WQMFromExec = isEntry;
 694   char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
 695   char NonWWMState = 0;
 696
 697   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
 698   if (isEntry)
 699     ++II; // Skip the instruction that saves LiveMask
 700
 701   // This stores the first instruction where it's safe to switch from WQM to
 702   // Exact or vice versa.
 703   MachineBasicBlock::iterator FirstWQM = IE;
 704
 705   // This stores the first instruction where it's safe to switch from WWM to
 706   // Exact/WQM or to switch to WWM. It must always be the same as, or after,
 707   // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
 708   // switch to/from WQM as well.
 709   MachineBasicBlock::iterator FirstWWM = IE;
 710   for (;;) {
 711     MachineBasicBlock::iterator Next = II;
 712     char Needs = StateExact | StateWQM; // WWM is disabled by default
 713     char OutNeeds = 0;
 714
 715     if (FirstWQM == IE)
 716       FirstWQM = II;
 717
 718     if (FirstWWM == IE)
 719       FirstWWM = II;
 720
 721     // First, figure out the allowed states (Needs) based on the propagated
 722     // flags.
 723     if (II != IE) {
 724       MachineInstr &MI = *II;
 725
 726       if (requiresCorrectState(MI)) {
 727         auto III = Instructions.find(&MI);
 728         if (III != Instructions.end()) {
 729           if (III->second.Needs & StateWWM)
 730             Needs = StateWWM;
 731           else if (III->second.Needs & StateWQM)
 732             Needs = StateWQM;
 733           else
 734             Needs &= ~III->second.Disabled;
 735           OutNeeds = III->second.OutNeeds;
 736         }
 737       } else {
 738         // If the instruction doesn't actually need a correct EXEC, then we can
 739         // safely leave WWM enabled.
 740         Needs = StateExact | StateWQM | StateWWM;
 741       }
 742
 743       if (MI.isTerminator() && OutNeeds == StateExact)
 744         Needs = StateExact;
 745
 746       if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
 747         MI.getOperand(3).setImm(1);
 748
 749       ++Next;
 750     } else {
 751       // End of basic block
 752       if (BI.OutNeeds & StateWQM)
 753         Needs = StateWQM;
 754       else if (BI.OutNeeds == StateExact)
 755         Needs = StateExact;
 756       else
 757         Needs = StateWQM | StateExact;
 758     }
 759
 760     // Now, transition if necessary.
 761     if (!(Needs & State)) {
 762       MachineBasicBlock::iterator First;
 763       if (State == StateWWM || Needs == StateWWM) {
 764         // We must switch to or from WWM
 765         First = FirstWWM;
 766       } else {
 767         // We only need to switch to/from WQM, so we can use FirstWQM
 768         First = FirstWQM;
 769       }
 770
 771       MachineBasicBlock::iterator Before =
 772           prepareInsertion(MBB, First, II, Needs == StateWQM,
 773                            Needs == StateExact || WQMFromExec);
 774
 775       if (State == StateWWM) {
 776         assert(SavedNonWWMReg);
 777         fromWWM(MBB, Before, SavedNonWWMReg);
 778         State = NonWWMState;
 779       }
 780
 781       if (Needs == StateWWM) {
 782         NonWWMState = State;
 783         SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 784         toWWM(MBB, Before, SavedNonWWMReg);
 785         State = StateWWM;
 786       } else {
 787         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
 788           if (!WQMFromExec && (OutNeeds & StateWQM))
 789             SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 790
 791           toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
 792           State = StateExact;
 793         } else if (State == StateExact && (Needs & StateWQM) &&
 794                    !(Needs & StateExact)) {
 795           assert(WQMFromExec == (SavedWQMReg == 0));
 796
 797           toWQM(MBB, Before, SavedWQMReg);
 798
 799           if (SavedWQMReg) {
 800             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
 801             SavedWQMReg = 0;
 802           }
 803           State = StateWQM;
 804         } else {
 805           // We can get here if we transitioned from WWM to a non-WWM state that
 806           // already matches our needs, but we shouldn't need to do anything.
 807           assert(Needs & State);
 808         }
 809       }
 810     }
 811
 812     if (Needs != (StateExact | StateWQM | StateWWM)) {
 813       if (Needs != (StateExact | StateWQM))
 814         FirstWQM = IE;
 815       FirstWWM = IE;
 816     }
 817
 818     if (II == IE)
 819       break;
 820     II = Next;
 821   }
 822 }
 823
 824 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
 825   for (MachineInstr *MI : LiveMaskQueries) {
 826     const DebugLoc &DL = MI->getDebugLoc();
 827     unsigned Dest = MI->getOperand(0).getReg();
 828     MachineInstr *Copy =
 829         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
 830             .addReg(LiveMaskReg);
 831
 832     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
 833     MI->eraseFromParent();
 834   }
 835 }
 836
 837 void SIWholeQuadMode::lowerCopyInstrs() {
 838   for (MachineInstr *MI : LowerToCopyInstrs) {
 839     for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
 840       MI->RemoveOperand(i);
 841     MI->setDesc(TII->get(AMDGPU::COPY));
 842   }
 843 }
 844
 845 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
 846   Instructions.clear();
 847   Blocks.clear();
 848   LiveMaskQueries.clear();
 849   LowerToCopyInstrs.clear();
 850   CallingConv = MF.getFunction().getCallingConv();
 851
 852   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 853
 854   TII = ST.getInstrInfo();
 855   TRI = &TII->getRegisterInfo();
 856   MRI = &MF.getRegInfo();
 857   LIS = &getAnalysis<LiveIntervals>();
 858
 859   char GlobalFlags = analyzeFunction(MF);
 860   unsigned LiveMaskReg = 0;
 861   if (!(GlobalFlags & StateWQM)) {
 862     lowerLiveMaskQueries(AMDGPU::EXEC);
 863     if (!(GlobalFlags & StateWWM))
 864       return !LiveMaskQueries.empty();
 865   } else {
 866     // Store a copy of the original live mask when required
 867     MachineBasicBlock &Entry = MF.front();
 868     MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
 869
 870     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
 871       LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 872       MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
 873                                  TII->get(AMDGPU::COPY), LiveMaskReg)
 874                              .addReg(AMDGPU::EXEC);
 875       LIS->InsertMachineInstrInMaps(*MI);
 876     }
 877
 878     lowerLiveMaskQueries(LiveMaskReg);
 879
 880     if (GlobalFlags == StateWQM) {
 881       // For a shader that needs only WQM, we can just set it once.
 882       BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
 883               AMDGPU::EXEC)
 884           .addReg(AMDGPU::EXEC);
 885
 886       lowerCopyInstrs();
 887       // EntryMI may become invalid here
 888       return true;
 889     }
 890   }
 891
 892   LLVM_DEBUG(printInfo());
 893
 894   lowerCopyInstrs();
 895
 896   // Handle the general case
 897   for (auto BII : Blocks)
 898     processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
 899
 900   // Physical registers like SCC aren't tracked by default anyway, so just
 901   // removing the ranges we computed is the simplest option for maintaining
 902   // the analysis results.
 903   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
 904
 905   return true;
 906 }