contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

   1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Insert wait instructions for memory reads and writes.
  12 ///
  13 /// Memory reads and writes are issued asynchronously, so we need to insert
  14 /// S_WAITCNT instructions when we want to access any of their results or
  15 /// overwrite any register that's used asynchronously.
  16 //
  17 //===----------------------------------------------------------------------===//
  18
  19 #include "AMDGPU.h"
  20 #include "AMDGPUSubtarget.h"
  21 #include "SIDefines.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIMachineFunctionInfo.h"
  24 #include "SIRegisterInfo.h"
  25 #include "Utils/AMDGPUBaseInfo.h"
  26 #include "llvm/ADT/DenseMap.h"
  27 #include "llvm/ADT/DenseSet.h"
  28 #include "llvm/ADT/PostOrderIterator.h"
  29 #include "llvm/ADT/STLExtras.h"
  30 #include "llvm/ADT/SmallVector.h"
  31 #include "llvm/CodeGen/MachineBasicBlock.h"
  32 #include "llvm/CodeGen/MachineFunction.h"
  33 #include "llvm/CodeGen/MachineFunctionPass.h"
  34 #include "llvm/CodeGen/MachineInstr.h"
  35 #include "llvm/CodeGen/MachineInstrBuilder.h"
  36 #include "llvm/CodeGen/MachineLoopInfo.h"
  37 #include "llvm/CodeGen/MachineMemOperand.h"
  38 #include "llvm/CodeGen/MachineOperand.h"
  39 #include "llvm/CodeGen/MachineRegisterInfo.h"
  40 #include "llvm/IR/DebugLoc.h"
  41 #include "llvm/Pass.h"
  42 #include "llvm/Support/Debug.h"
  43 #include "llvm/Support/ErrorHandling.h"
  44 #include "llvm/Support/raw_ostream.h"
  45 #include <algorithm>
  46 #include <cassert>
  47 #include <cstdint>
  48 #include <cstring>
  49 #include <memory>
  50 #include <utility>
  51 #include <vector>
  52
  53 #define DEBUG_TYPE "si-insert-waitcnts"
  54
  55 using namespace llvm;
  56
  57 namespace {
  58
  59 // Class of object that encapsulates latest instruction counter score
  60 // associated with the operand.  Used for determining whether
  61 // s_waitcnt instruction needs to be emited.
  62
  63 #define CNT_MASK(t) (1u << (t))
  64
  65 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
  66
  67 using RegInterval = std::pair<signed, signed>;
  68
  69 struct {
  70   int32_t VmcntMax;
  71   int32_t ExpcntMax;
  72   int32_t LgkmcntMax;
  73   int32_t NumVGPRsMax;
  74   int32_t NumSGPRsMax;
  75 } HardwareLimits;
  76
  77 struct {
  78   unsigned VGPR0;
  79   unsigned VGPRL;
  80   unsigned SGPR0;
  81   unsigned SGPRL;
  82 } RegisterEncoding;
  83
  84 enum WaitEventType {
  85   VMEM_ACCESS,      // vector-memory read & write
  86   LDS_ACCESS,       // lds read & write
  87   GDS_ACCESS,       // gds read & write
  88   SQ_MESSAGE,       // send message
  89   SMEM_ACCESS,      // scalar-memory read & write
  90   EXP_GPR_LOCK,     // export holding on its data src
  91   GDS_GPR_LOCK,     // GDS holding on its data and addr src
  92   EXP_POS_ACCESS,   // write to export position
  93   EXP_PARAM_ACCESS, // write to export parameter
  94   VMW_GPR_LOCK,     // vector-memory write holding on its data src
  95   NUM_WAIT_EVENTS,
  96 };
  97
  98 // The mapping is:
  99 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
 100 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
 101 //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
 102 // We reserve a fixed number of VGPR slots in the scoring tables for
 103 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
 104 enum RegisterMapping {
 105   SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
 106   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
 107   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
 108   EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
 109   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
 110 };
 111
 112 #define ForAllWaitEventType(w)                                                 \
 113   for (enum WaitEventType w = (enum WaitEventType)0;                           \
 114        (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
 115        (w) = (enum WaitEventType)((w) + 1))
 116
 117 // This is a per-basic-block object that maintains current score brackets
 118 // of each wait-counter, and a per-register scoreboard for each wait-couner.
 119 // We also maintain the latest score for every event type that can change the
 120 // waitcnt in order to know if there are multiple types of events within
 121 // the brackets. When multiple types of event happen in the bracket,
 122 // wait-count may get decreased out of order, therefore we need to put in
 123 // "s_waitcnt 0" before use.
 124 class BlockWaitcntBrackets {
 125 public:
 126   BlockWaitcntBrackets() {
 127     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
 128          T = (enum InstCounterType)(T + 1)) {
 129       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
 130     }
 131   }
 132
 133   ~BlockWaitcntBrackets() = default;
 134
 135   static int32_t getWaitCountMax(InstCounterType T) {
 136     switch (T) {
 137     case VM_CNT:
 138       return HardwareLimits.VmcntMax;
 139     case LGKM_CNT:
 140       return HardwareLimits.LgkmcntMax;
 141     case EXP_CNT:
 142       return HardwareLimits.ExpcntMax;
 143     default:
 144       break;
 145     }
 146     return 0;
 147   }
 148
 149   void setScoreLB(InstCounterType T, int32_t Val) {
 150     assert(T < NUM_INST_CNTS);
 151     if (T >= NUM_INST_CNTS)
 152       return;
 153     ScoreLBs[T] = Val;
 154   }
 155
 156   void setScoreUB(InstCounterType T, int32_t Val) {
 157     assert(T < NUM_INST_CNTS);
 158     if (T >= NUM_INST_CNTS)
 159       return;
 160     ScoreUBs[T] = Val;
 161     if (T == EXP_CNT) {
 162       int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
 163       if (ScoreLBs[T] < UB)
 164         ScoreLBs[T] = UB;
 165     }
 166   }
 167
 168   int32_t getScoreLB(InstCounterType T) {
 169     assert(T < NUM_INST_CNTS);
 170     if (T >= NUM_INST_CNTS)
 171       return 0;
 172     return ScoreLBs[T];
 173   }
 174
 175   int32_t getScoreUB(InstCounterType T) {
 176     assert(T < NUM_INST_CNTS);
 177     if (T >= NUM_INST_CNTS)
 178       return 0;
 179     return ScoreUBs[T];
 180   }
 181
 182   // Mapping from event to counter.
 183   InstCounterType eventCounter(WaitEventType E) {
 184     switch (E) {
 185     case VMEM_ACCESS:
 186       return VM_CNT;
 187     case LDS_ACCESS:
 188     case GDS_ACCESS:
 189     case SQ_MESSAGE:
 190     case SMEM_ACCESS:
 191       return LGKM_CNT;
 192     case EXP_GPR_LOCK:
 193     case GDS_GPR_LOCK:
 194     case VMW_GPR_LOCK:
 195     case EXP_POS_ACCESS:
 196     case EXP_PARAM_ACCESS:
 197       return EXP_CNT;
 198     default:
 199       llvm_unreachable("unhandled event type");
 200     }
 201     return NUM_INST_CNTS;
 202   }
 203
 204   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
 205     if (GprNo < NUM_ALL_VGPRS) {
 206       if (GprNo > VgprUB) {
 207         VgprUB = GprNo;
 208       }
 209       VgprScores[T][GprNo] = Val;
 210     } else {
 211       assert(T == LGKM_CNT);
 212       if (GprNo - NUM_ALL_VGPRS > SgprUB) {
 213         SgprUB = GprNo - NUM_ALL_VGPRS;
 214       }
 215       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
 216     }
 217   }
 218
 219   int32_t getRegScore(int GprNo, InstCounterType T) {
 220     if (GprNo < NUM_ALL_VGPRS) {
 221       return VgprScores[T][GprNo];
 222     }
 223     return SgprScores[GprNo - NUM_ALL_VGPRS];
 224   }
 225
 226   void clear() {
 227     memset(ScoreLBs, 0, sizeof(ScoreLBs));
 228     memset(ScoreUBs, 0, sizeof(ScoreUBs));
 229     memset(EventUBs, 0, sizeof(EventUBs));
 230     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
 231          T = (enum InstCounterType)(T + 1)) {
 232       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
 233     }
 234     memset(SgprScores, 0, sizeof(SgprScores));
 235   }
 236
 237   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
 238                              const MachineRegisterInfo *MRI,
 239                              const SIRegisterInfo *TRI, unsigned OpNo,
 240                              bool Def) const;
 241
 242   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
 243                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
 244                    unsigned OpNo, int32_t Val);
 245
 246   void setWaitAtBeginning() { WaitAtBeginning = true; }
 247   void clearWaitAtBeginning() { WaitAtBeginning = false; }
 248   bool getWaitAtBeginning() const { return WaitAtBeginning; }
 249   void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
 250   int32_t getMaxVGPR() const { return VgprUB; }
 251   int32_t getMaxSGPR() const { return SgprUB; }
 252
 253   int32_t getEventUB(enum WaitEventType W) const {
 254     assert(W < NUM_WAIT_EVENTS);
 255     return EventUBs[W];
 256   }
 257
 258   bool counterOutOfOrder(InstCounterType T);
 259   unsigned int updateByWait(InstCounterType T, int ScoreToWait);
 260   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
 261                      const MachineRegisterInfo *MRI, WaitEventType E,
 262                      MachineInstr &MI);
 263
 264   bool hasPendingSMEM() const {
 265     return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
 266             EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
 267   }
 268
 269   bool hasPendingFlat() const {
 270     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
 271              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
 272             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
 273              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
 274   }
 275
 276   void setPendingFlat() {
 277     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
 278     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
 279   }
 280
 281   int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
 282
 283   void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
 284
 285   bool getRevisitLoop() const { return RevisitLoop; }
 286   void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
 287
 288   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
 289   int32_t getPostOrder() const { return PostOrder; }
 290
 291   void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
 292   void clearWaitcnt() { Waitcnt = nullptr; }
 293   MachineInstr *getWaitcnt() const { return Waitcnt; }
 294
 295   bool mixedExpTypes() const { return MixedExpTypes; }
 296   void setMixedExpTypes(bool MixedExpTypesIn) {
 297     MixedExpTypes = MixedExpTypesIn;
 298   }
 299
 300   void print(raw_ostream &);
 301   void dump() { print(dbgs()); }
 302
 303 private:
 304   bool WaitAtBeginning = false;
 305   bool RevisitLoop = false;
 306   bool MixedExpTypes = false;
 307   int32_t PostOrder = 0;
 308   MachineInstr *Waitcnt = nullptr;
 309   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
 310   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
 311   int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
 312   // Remember the last flat memory operation.
 313   int32_t LastFlat[NUM_INST_CNTS] = {0};
 314   // wait_cnt scores for every vgpr.
 315   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
 316   int32_t VgprUB = 0;
 317   int32_t SgprUB = 0;
 318   int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
 319   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
 320   int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
 321 };
 322
 323 // This is a per-loop-region object that records waitcnt status at the end of
 324 // loop footer from the previous iteration. We also maintain an iteration
 325 // count to track the number of times the loop has been visited. When it
 326 // doesn't converge naturally, we force convergence by inserting s_waitcnt 0
 327 // at the end of the loop footer.
 328 class LoopWaitcntData {
 329 public:
 330   LoopWaitcntData() = default;
 331   ~LoopWaitcntData() = default;
 332
 333   void incIterCnt() { IterCnt++; }
 334   void resetIterCnt() { IterCnt = 0; }
 335   int32_t getIterCnt() { return IterCnt; }
 336
 337   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
 338   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
 339
 340   void print() {
 341     DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
 342   }
 343
 344 private:
 345   // s_waitcnt added at the end of loop footer to stablize wait scores
 346   // at the end of the loop footer.
 347   MachineInstr *LfWaitcnt = nullptr;
 348   // Number of iterations the loop has been visited, not including the initial
 349   // walk over.
 350   int32_t IterCnt = 0;
 351 };
 352
 353 class SIInsertWaitcnts : public MachineFunctionPass {
 354 private:
 355   const SISubtarget *ST = nullptr;
 356   const SIInstrInfo *TII = nullptr;
 357   const SIRegisterInfo *TRI = nullptr;
 358   const MachineRegisterInfo *MRI = nullptr;
 359   const MachineLoopInfo *MLI = nullptr;
 360   AMDGPU::IsaInfo::IsaVersion IV;
 361   AMDGPUAS AMDGPUASI;
 362
 363   DenseSet<MachineBasicBlock *> BlockVisitedSet;
 364   DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
 365   DenseSet<MachineInstr *> VCCZBugHandledSet;
 366
 367   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
 368       BlockWaitcntBracketsMap;
 369
 370   DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
 371
 372   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
 373
 374   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
 375
 376 public:
 377   static char ID;
 378
 379   SIInsertWaitcnts() : MachineFunctionPass(ID) {}
 380
 381   bool runOnMachineFunction(MachineFunction &MF) override;
 382
 383   StringRef getPassName() const override {
 384     return "SI insert wait instructions";
 385   }
 386
 387   void getAnalysisUsage(AnalysisUsage &AU) const override {
 388     AU.setPreservesCFG();
 389     AU.addRequired<MachineLoopInfo>();
 390     MachineFunctionPass::getAnalysisUsage(AU);
 391   }
 392
 393   void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
 394     // The waitcnt information is copied because it changes as the block is
 395     // traversed.
 396     KillWaitBrackets.push_back(
 397         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
 398   }
 399
 400   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
 401   MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
 402                                            BlockWaitcntBrackets *ScoreBrackets);
 403   void updateEventWaitCntAfter(MachineInstr &Inst,
 404                                BlockWaitcntBrackets *ScoreBrackets);
 405   void mergeInputScoreBrackets(MachineBasicBlock &Block);
 406   MachineBasicBlock *loopBottom(const MachineLoop *Loop);
 407   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
 408   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
 409 };
 410
 411 } // end anonymous namespace
 412
 413 RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
 414                                                  const SIInstrInfo *TII,
 415                                                  const MachineRegisterInfo *MRI,
 416                                                  const SIRegisterInfo *TRI,
 417                                                  unsigned OpNo,
 418                                                  bool Def) const {
 419   const MachineOperand &Op = MI->getOperand(OpNo);
 420   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
 421       (Def && !Op.isDef()))
 422     return {-1, -1};
 423
 424   // A use via a PW operand does not need a waitcnt.
 425   // A partial write is not a WAW.
 426   assert(!Op.getSubReg() || !Op.isUndef());
 427
 428   RegInterval Result;
 429   const MachineRegisterInfo &MRIA = *MRI;
 430
 431   unsigned Reg = TRI->getEncodingValue(Op.getReg());
 432
 433   if (TRI->isVGPR(MRIA, Op.getReg())) {
 434     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
 435     Result.first = Reg - RegisterEncoding.VGPR0;
 436     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
 437   } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
 438     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
 439     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
 440     assert(Result.first >= NUM_ALL_VGPRS &&
 441            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
 442   }
 443   // TODO: Handle TTMP
 444   // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
 445   else
 446     return {-1, -1};
 447
 448   const MachineInstr &MIA = *MI;
 449   const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
 450   unsigned Size = TRI->getRegSizeInBits(*RC);
 451   Result.second = Result.first + (Size / 32);
 452
 453   return Result;
 454 }
 455
 456 void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
 457                                        const SIInstrInfo *TII,
 458                                        const SIRegisterInfo *TRI,
 459                                        const MachineRegisterInfo *MRI,
 460                                        unsigned OpNo, int32_t Val) {
 461   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
 462   DEBUG({
 463     const MachineOperand &Opnd = MI->getOperand(OpNo);
 464     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
 465   });
 466   for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
 467     setRegScore(RegNo, EXP_CNT, Val);
 468   }
 469 }
 470
 471 void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
 472                                          const SIRegisterInfo *TRI,
 473                                          const MachineRegisterInfo *MRI,
 474                                          WaitEventType E, MachineInstr &Inst) {
 475   const MachineRegisterInfo &MRIA = *MRI;
 476   InstCounterType T = eventCounter(E);
 477   int32_t CurrScore = getScoreUB(T) + 1;
 478   // EventUB and ScoreUB need to be update regardless if this event changes
 479   // the score of a register or not.
 480   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
 481   EventUBs[E] = CurrScore;
 482   setScoreUB(T, CurrScore);
 483
 484   if (T == EXP_CNT) {
 485     // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
 486     // is required.
 487     if (!MixedExpTypes) {
 488       MixedExpTypes = counterOutOfOrder(EXP_CNT);
 489     }
 490
 491     // Put score on the source vgprs. If this is a store, just use those
 492     // specific register(s).
 493     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
 494       // All GDS operations must protect their address register (same as
 495       // export.)
 496       if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
 497           Inst.getOpcode() != AMDGPU::DS_CONSUME) {
 498         setExpScore(
 499             &Inst, TII, TRI, MRI,
 500             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
 501             CurrScore);
 502       }
 503       if (Inst.mayStore()) {
 504         setExpScore(
 505             &Inst, TII, TRI, MRI,
 506             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
 507             CurrScore);
 508         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
 509                                        AMDGPU::OpName::data1) != -1) {
 510           setExpScore(&Inst, TII, TRI, MRI,
 511                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
 512                                                  AMDGPU::OpName::data1),
 513                       CurrScore);
 514         }
 515       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
 516                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
 517                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
 518                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
 519                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
 520                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
 521                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
 522                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
 523                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
 524         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
 525           const MachineOperand &Op = Inst.getOperand(I);
 526           if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
 527             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
 528           }
 529         }
 530       }
 531     } else if (TII->isFLAT(Inst)) {
 532       if (Inst.mayStore()) {
 533         setExpScore(
 534             &Inst, TII, TRI, MRI,
 535             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
 536             CurrScore);
 537       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
 538         setExpScore(
 539             &Inst, TII, TRI, MRI,
 540             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
 541             CurrScore);
 542       }
 543     } else if (TII->isMIMG(Inst)) {
 544       if (Inst.mayStore()) {
 545         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
 546       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
 547         setExpScore(
 548             &Inst, TII, TRI, MRI,
 549             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
 550             CurrScore);
 551       }
 552     } else if (TII->isMTBUF(Inst)) {
 553       if (Inst.mayStore()) {
 554         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
 555       }
 556     } else if (TII->isMUBUF(Inst)) {
 557       if (Inst.mayStore()) {
 558         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
 559       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
 560         setExpScore(
 561             &Inst, TII, TRI, MRI,
 562             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
 563             CurrScore);
 564       }
 565     } else {
 566       if (TII->isEXP(Inst)) {
 567         // For export the destination registers are really temps that
 568         // can be used as the actual source after export patching, so
 569         // we need to treat them like sources and set the EXP_CNT
 570         // score.
 571         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
 572           MachineOperand &DefMO = Inst.getOperand(I);
 573           if (DefMO.isReg() && DefMO.isDef() &&
 574               TRI->isVGPR(MRIA, DefMO.getReg())) {
 575             setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
 576                         CurrScore);
 577           }
 578         }
 579       }
 580       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
 581         MachineOperand &MO = Inst.getOperand(I);
 582         if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
 583           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
 584         }
 585       }
 586     }
 587 #if 0 // TODO: check if this is handled by MUBUF code above.
 588   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
 589        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
 590        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
 591     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
 592     unsigned OpNo;//TODO: find the OpNo for this operand;
 593     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
 594     for (signed RegNo = Interval.first; RegNo < Interval.second;
 595     ++RegNo) {
 596       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
 597     }
 598 #endif
 599   } else {
 600     // Match the score to the destination registers.
 601     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
 602       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
 603       if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
 604         continue;
 605       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
 606         setRegScore(RegNo, T, CurrScore);
 607       }
 608     }
 609     if (TII->isDS(Inst) && Inst.mayStore()) {
 610       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
 611     }
 612   }
 613 }
 614
 615 void BlockWaitcntBrackets::print(raw_ostream &OS) {
 616   OS << '\n';
 617   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
 618        T = (enum InstCounterType)(T + 1)) {
 619     int LB = getScoreLB(T);
 620     int UB = getScoreUB(T);
 621
 622     switch (T) {
 623     case VM_CNT:
 624       OS << "    VM_CNT(" << UB - LB << "): ";
 625       break;
 626     case LGKM_CNT:
 627       OS << "    LGKM_CNT(" << UB - LB << "): ";
 628       break;
 629     case EXP_CNT:
 630       OS << "    EXP_CNT(" << UB - LB << "): ";
 631       break;
 632     default:
 633       OS << "    UNKNOWN(" << UB - LB << "): ";
 634       break;
 635     }
 636
 637     if (LB < UB) {
 638       // Print vgpr scores.
 639       for (int J = 0; J <= getMaxVGPR(); J++) {
 640         int RegScore = getRegScore(J, T);
 641         if (RegScore <= LB)
 642           continue;
 643         int RelScore = RegScore - LB - 1;
 644         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
 645           OS << RelScore << ":v" << J << " ";
 646         } else {
 647           OS << RelScore << ":ds ";
 648         }
 649       }
 650       // Also need to print sgpr scores for lgkm_cnt.
 651       if (T == LGKM_CNT) {
 652         for (int J = 0; J <= getMaxSGPR(); J++) {
 653           int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
 654           if (RegScore <= LB)
 655             continue;
 656           int RelScore = RegScore - LB - 1;
 657           OS << RelScore << ":s" << J << " ";
 658         }
 659       }
 660     }
 661     OS << '\n';
 662   }
 663   OS << '\n';
 664 }
 665
 666 unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
 667                                                 int ScoreToWait) {
 668   unsigned int NeedWait = 0;
 669   if (ScoreToWait == -1) {
 670     // The score to wait is unknown. This implies that it was not encountered
 671     // during the path of the CFG walk done during the current traversal but
 672     // may be seen on a different path. Emit an s_wait counter with a
 673     // conservative value of 0 for the counter.
 674     NeedWait = CNT_MASK(T);
 675     setScoreLB(T, getScoreUB(T));
 676     return NeedWait;
 677   }
 678
 679   // If the score of src_operand falls within the bracket, we need an
 680   // s_waitcnt instruction.
 681   const int32_t LB = getScoreLB(T);
 682   const int32_t UB = getScoreUB(T);
 683   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
 684     if (T == VM_CNT && hasPendingFlat()) {
 685       // If there is a pending FLAT operation, and this is a VM waitcnt,
 686       // then we need to force a waitcnt 0 for VM.
 687       NeedWait = CNT_MASK(T);
 688       setScoreLB(T, getScoreUB(T));
 689     } else if (counterOutOfOrder(T)) {
 690       // Counter can get decremented out-of-order when there
 691       // are multiple types event in the brack. Also emit an s_wait counter
 692       // with a conservative value of 0 for the counter.
 693       NeedWait = CNT_MASK(T);
 694       setScoreLB(T, getScoreUB(T));
 695     } else {
 696       NeedWait = CNT_MASK(T);
 697       setScoreLB(T, ScoreToWait);
 698     }
 699   }
 700
 701   return NeedWait;
 702 }
 703
 704 // Where there are multiple types of event in the bracket of a counter,
 705 // the decrement may go out of order.
 706 bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
 707   switch (T) {
 708   case VM_CNT:
 709     return false;
 710   case LGKM_CNT: {
 711     if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
 712         EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
 713       // Scalar memory read always can go out of order.
 714       return true;
 715     }
 716     int NumEventTypes = 0;
 717     if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
 718         EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
 719       NumEventTypes++;
 720     }
 721     if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
 722         EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
 723       NumEventTypes++;
 724     }
 725     if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
 726         EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
 727       NumEventTypes++;
 728     }
 729     if (NumEventTypes <= 1) {
 730       return false;
 731     }
 732     break;
 733   }
 734   case EXP_CNT: {
 735     // If there has been a mixture of export types, then a waitcnt exp(0) is
 736     // required.
 737     if (MixedExpTypes)
 738       return true;
 739     int NumEventTypes = 0;
 740     if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
 741         EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
 742       NumEventTypes++;
 743     }
 744     if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
 745         EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
 746       NumEventTypes++;
 747     }
 748     if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
 749         EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
 750       NumEventTypes++;
 751     }
 752     if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
 753         EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
 754       NumEventTypes++;
 755     }
 756
 757     if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
 758         EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
 759       NumEventTypes++;
 760     }
 761
 762     if (NumEventTypes <= 1) {
 763       return false;
 764     }
 765     break;
 766   }
 767   default:
 768     break;
 769   }
 770   return true;
 771 }
 772
 773 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
 774                       false)
 775 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
 776                     false)
 777
 778 char SIInsertWaitcnts::ID = 0;
 779
 780 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
 781
 782 FunctionPass *llvm::createSIInsertWaitcntsPass() {
 783   return new SIInsertWaitcnts();
 784 }
 785
 786 static bool readsVCCZ(const MachineInstr &MI) {
 787   unsigned Opc = MI.getOpcode();
 788   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
 789          !MI.getOperand(1).isUndef();
 790 }
 791
 792 ///  \brief Generate s_waitcnt instruction to be placed before cur_Inst.
 793 ///  Instructions of a given type are returned in order,
 794 ///  but instructions of different types can complete out of order.
 795 ///  We rely on this in-order completion
 796 ///  and simply assign a score to the memory access instructions.
 797 ///  We keep track of the active "score bracket" to determine
 798 ///  if an access of a memory read requires an s_waitcnt
 799 ///  and if so what the value of each counter is.
 800 ///  The "score bracket" is bound by the lower bound and upper bound
 801 ///  scores (*_score_LB and *_score_ub respectively).
 802 MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
 803     MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
 804   // To emit, or not to emit - that's the question!
 805   // Start with an assumption that there is no need to emit.
 806   unsigned int EmitSwaitcnt = 0;
 807   // s_waitcnt instruction to return; default is NULL.
 808   MachineInstr *SWaitInst = nullptr;
 809   // No need to wait before phi. If a phi-move exists, then the wait should
 810   // has been inserted before the move. If a phi-move does not exist, then
 811   // wait should be inserted before the real use. The same is true for
 812   // sc-merge. It is not a coincident that all these cases correspond to the
 813   // instructions that are skipped in the assembling loop.
 814   bool NeedLineMapping = false; // TODO: Check on this.
 815   if (MI.isDebugValue() &&
 816       // TODO: any other opcode?
 817       !NeedLineMapping) {
 818     return SWaitInst;
 819   }
 820
 821   // See if an s_waitcnt is forced at block entry, or is needed at
 822   // program end.
 823   if (ScoreBrackets->getWaitAtBeginning()) {
 824     // Note that we have already cleared the state, so we don't need to update
 825     // it.
 826     ScoreBrackets->clearWaitAtBeginning();
 827     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
 828          T = (enum InstCounterType)(T + 1)) {
 829       EmitSwaitcnt |= CNT_MASK(T);
 830       ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
 831     }
 832   }
 833
 834   // See if this instruction has a forced S_WAITCNT VM.
 835   // TODO: Handle other cases of NeedsWaitcntVmBefore()
 836   else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
 837            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
 838            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
 839     EmitSwaitcnt |=
 840         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
 841   }
 842
 843   // All waits must be resolved at call return.
 844   // NOTE: this could be improved with knowledge of all call sites or
 845   //   with knowledge of the called routines.
 846   if (MI.getOpcode() == AMDGPU::RETURN ||
 847       MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
 848       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
 849     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
 850          T = (enum InstCounterType)(T + 1)) {
 851       if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
 852         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
 853         EmitSwaitcnt |= CNT_MASK(T);
 854       }
 855     }
 856   }
 857   // Resolve vm waits before gs-done.
 858   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
 859             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
 860            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
 861             AMDGPU::SendMsg::ID_GS_DONE)) {
 862     if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
 863       ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
 864       EmitSwaitcnt |= CNT_MASK(VM_CNT);
 865     }
 866   }
 867 #if 0 // TODO: the following blocks of logic when we have fence.
 868   else if (MI.getOpcode() == SC_FENCE) {
 869     const unsigned int group_size =
 870       context->shader_info->GetMaxThreadGroupSize();
 871     // group_size == 0 means thread group size is unknown at compile time
 872     const bool group_is_multi_wave =
 873       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
 874     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
 875
 876     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
 877       SCRegType src_type = Inst->GetSrcType(i);
 878       switch (src_type) {
 879         case SCMEM_LDS:
 880           if (group_is_multi_wave ||
 881             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
 882             EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
 883                                ScoreBrackets->getScoreUB(LGKM_CNT));
 884             // LDS may have to wait for VM_CNT after buffer load to LDS
 885             if (target_info->HasBufferLoadToLDS()) {
 886               EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
 887                                  ScoreBrackets->getScoreUB(VM_CNT));
 888             }
 889           }
 890           break;
 891
 892         case SCMEM_GDS:
 893           if (group_is_multi_wave || fence_is_global) {
 894             EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
 895               ScoreBrackets->getScoreUB(EXP_CNT));
 896             EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
 897               ScoreBrackets->getScoreUB(LGKM_CNT));
 898           }
 899           break;
 900
 901         case SCMEM_UAV:
 902         case SCMEM_TFBUF:
 903         case SCMEM_RING:
 904         case SCMEM_SCATTER:
 905           if (group_is_multi_wave || fence_is_global) {
 906             EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
 907               ScoreBrackets->getScoreUB(EXP_CNT));
 908             EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
 909               ScoreBrackets->getScoreUB(VM_CNT));
 910           }
 911           break;
 912
 913         case SCMEM_SCRATCH:
 914         default:
 915           break;
 916       }
 917     }
 918   }
 919 #endif
 920
 921   // Export & GDS instructions do not read the EXEC mask until after the export
 922   // is granted (which can occur well after the instruction is issued).
 923   // The shader program must flush all EXP operations on the export-count
 924   // before overwriting the EXEC mask.
 925   else {
 926     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
 927       // Export and GDS are tracked individually, either may trigger a waitcnt
 928       // for EXEC.
 929       EmitSwaitcnt |= ScoreBrackets->updateByWait(
 930           EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
 931       EmitSwaitcnt |= ScoreBrackets->updateByWait(
 932           EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
 933       EmitSwaitcnt |= ScoreBrackets->updateByWait(
 934           EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
 935       EmitSwaitcnt |= ScoreBrackets->updateByWait(
 936           EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
 937     }
 938
 939 #if 0 // TODO: the following code to handle CALL.
 940     // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
 941     // However, there is a problem with EXP_CNT, because the call cannot
 942     // easily tell if a register is used in the function, and if it did, then
 943     // the referring instruction would have to have an S_WAITCNT, which is
 944     // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
 945     // before the call.
 946     if (MI.getOpcode() == SC_CALL) {
 947       if (ScoreBrackets->getScoreUB(EXP_CNT) >
 948         ScoreBrackets->getScoreLB(EXP_CNT)) {
 949         ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
 950         EmitSwaitcnt |= CNT_MASK(EXP_CNT);
 951       }
 952     }
 953 #endif
 954
 955     // FIXME: Should not be relying on memoperands.
 956     // Look at the source operands of every instruction to see if
 957     // any of them results from a previous memory operation that affects
 958     // its current usage. If so, an s_waitcnt instruction needs to be
 959     // emitted.
 960     // If the source operand was defined by a load, add the s_waitcnt
 961     // instruction.
 962     for (const MachineMemOperand *Memop : MI.memoperands()) {
 963       unsigned AS = Memop->getAddrSpace();
 964       if (AS != AMDGPUASI.LOCAL_ADDRESS)
 965         continue;
 966       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
 967       // VM_CNT is only relevant to vgpr or LDS.
 968       EmitSwaitcnt |= ScoreBrackets->updateByWait(
 969           VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
 970     }
 971
 972     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
 973       const MachineOperand &Op = MI.getOperand(I);
 974       const MachineRegisterInfo &MRIA = *MRI;
 975       RegInterval Interval =
 976           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
 977       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
 978         if (TRI->isVGPR(MRIA, Op.getReg())) {
 979           // VM_CNT is only relevant to vgpr or LDS.
 980           EmitSwaitcnt |= ScoreBrackets->updateByWait(
 981               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
 982         }
 983         EmitSwaitcnt |= ScoreBrackets->updateByWait(
 984             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
 985       }
 986     }
 987     // End of for loop that looks at all source operands to decide vm_wait_cnt
 988     // and lgk_wait_cnt.
 989
 990     // Two cases are handled for destination operands:
 991     // 1) If the destination operand was defined by a load, add the s_waitcnt
 992     // instruction to guarantee the right WAW order.
 993     // 2) If a destination operand that was used by a recent export/store ins,
 994     // add s_waitcnt on exp_cnt to guarantee the WAR order.
 995     if (MI.mayStore()) {
 996       // FIXME: Should not be relying on memoperands.
 997       for (const MachineMemOperand *Memop : MI.memoperands()) {
 998         unsigned AS = Memop->getAddrSpace();
 999         if (AS != AMDGPUASI.LOCAL_ADDRESS)
1000           continue;
1001         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1002         EmitSwaitcnt |= ScoreBrackets->updateByWait(
1003             VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1004         EmitSwaitcnt |= ScoreBrackets->updateByWait(
1005             EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1006       }
1007     }
1008     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1009       MachineOperand &Def = MI.getOperand(I);
1010       const MachineRegisterInfo &MRIA = *MRI;
1011       RegInterval Interval =
1012           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
1013       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1014         if (TRI->isVGPR(MRIA, Def.getReg())) {
1015           EmitSwaitcnt |= ScoreBrackets->updateByWait(
1016               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1017           EmitSwaitcnt |= ScoreBrackets->updateByWait(
1018               EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1019         }
1020         EmitSwaitcnt |= ScoreBrackets->updateByWait(
1021             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1022       }
1023     } // End of for loop that looks at all dest operands.
1024   }
1025
1026   // TODO: Tie force zero to a compiler triage option.
1027   bool ForceZero = false;
1028
1029   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1030   // occurs before the instruction. Doing it here prevents any additional
1031   // S_WAITCNTs from being emitted if the instruction was marked as
1032   // requiring a WAITCNT beforehand.
1033   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1034       !ST->hasAutoWaitcntBeforeBarrier()) {
1035     EmitSwaitcnt |=
1036         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1037     EmitSwaitcnt |= ScoreBrackets->updateByWait(
1038         EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1039     EmitSwaitcnt |= ScoreBrackets->updateByWait(
1040         LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
1041   }
1042
1043   // TODO: Remove this work-around, enable the assert for Bug 457939
1044   //       after fixing the scheduler. Also, the Shader Compiler code is
1045   //       independent of target.
1046   if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
1047     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1048             ScoreBrackets->getScoreUB(LGKM_CNT) &&
1049         ScoreBrackets->hasPendingSMEM()) {
1050       // Wait on everything, not just LGKM.  vccz reads usually come from
1051       // terminators, and we always wait on everything at the end of the
1052       // block, so if we only wait on LGKM here, we might end up with
1053       // another s_waitcnt inserted right after this if there are non-LGKM
1054       // instructions still outstanding.
1055       ForceZero = true;
1056       EmitSwaitcnt = true;
1057     }
1058   }
1059
1060   // Does this operand processing indicate s_wait counter update?
1061   if (EmitSwaitcnt) {
1062     int CntVal[NUM_INST_CNTS];
1063
1064     bool UseDefaultWaitcntStrategy = true;
1065     if (ForceZero) {
1066       // Force all waitcnts to 0.
1067       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1068            T = (enum InstCounterType)(T + 1)) {
1069         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1070       }
1071       CntVal[VM_CNT] = 0;
1072       CntVal[EXP_CNT] = 0;
1073       CntVal[LGKM_CNT] = 0;
1074       UseDefaultWaitcntStrategy = false;
1075     }
1076
1077     if (UseDefaultWaitcntStrategy) {
1078       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1079            T = (enum InstCounterType)(T + 1)) {
1080         if (EmitSwaitcnt & CNT_MASK(T)) {
1081           int Delta =
1082               ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
1083           int MaxDelta = ScoreBrackets->getWaitCountMax(T);
1084           if (Delta >= MaxDelta) {
1085             Delta = -1;
1086             if (T != EXP_CNT) {
1087               ScoreBrackets->setScoreLB(
1088                   T, ScoreBrackets->getScoreUB(T) - MaxDelta);
1089             }
1090             EmitSwaitcnt &= ~CNT_MASK(T);
1091           }
1092           CntVal[T] = Delta;
1093         } else {
1094           // If we are not waiting for a particular counter then encode
1095           // it as -1 which means "don't care."
1096           CntVal[T] = -1;
1097         }
1098       }
1099     }
1100
1101     // If we are not waiting on any counter we can skip the wait altogether.
1102     if (EmitSwaitcnt != 0) {
1103       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
1104       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
1105       if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
1106                           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
1107           (AMDGPU::decodeExpcnt(IV, Imm) !=
1108            (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
1109           (AMDGPU::decodeLgkmcnt(IV, Imm) !=
1110            (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
1111         MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
1112         if (ContainingLoop) {
1113           MachineBasicBlock *TBB = ContainingLoop->getHeader();
1114           BlockWaitcntBrackets *ScoreBracket =
1115               BlockWaitcntBracketsMap[TBB].get();
1116           if (!ScoreBracket) {
1117             assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
1118             BlockWaitcntBracketsMap[TBB] =
1119                 llvm::make_unique<BlockWaitcntBrackets>();
1120             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
1121           }
1122           ScoreBracket->setRevisitLoop(true);
1123           DEBUG(dbgs() << "set-revisit: block"
1124                        << ContainingLoop->getHeader()->getNumber() << '\n';);
1125         }
1126       }
1127
1128       // Update an existing waitcount, or make a new one.
1129       MachineFunction &MF = *MI.getParent()->getParent();
1130       if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
1131         SWaitInst = OldWaitcnt;
1132       } else {
1133         SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
1134                                           MI.getDebugLoc());
1135         CompilerGeneratedWaitcntSet.insert(SWaitInst);
1136       }
1137
1138       const MachineOperand &Op =
1139           MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
1140               IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
1141       SWaitInst->addOperand(MF, Op);
1142
1143       if (CntVal[EXP_CNT] == 0) {
1144         ScoreBrackets->setMixedExpTypes(false);
1145       }
1146     }
1147   }
1148
1149   return SWaitInst;
1150 }
1151
1152 void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1153                                              MachineInstr *Waitcnt) {
1154   if (MBB.empty()) {
1155     MBB.push_back(Waitcnt);
1156     return;
1157   }
1158
1159   MachineBasicBlock::iterator It = MBB.end();
1160   MachineInstr *MI = &*(--It);
1161   if (MI->isBranch()) {
1162     MBB.insert(It, Waitcnt);
1163   } else {
1164     MBB.push_back(Waitcnt);
1165   }
1166 }
1167
1168 // This is a flat memory operation. Check to see if it has memory
1169 // tokens for both LDS and Memory, and if so mark it as a flat.
1170 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1171   if (MI.memoperands_empty())
1172     return true;
1173
1174   for (const MachineMemOperand *Memop : MI.memoperands()) {
1175     unsigned AS = Memop->getAddrSpace();
1176     if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
1177       return true;
1178   }
1179
1180   return false;
1181 }
1182
1183 void SIInsertWaitcnts::updateEventWaitCntAfter(
1184     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1185   // Now look at the instruction opcode. If it is a memory access
1186   // instruction, update the upper-bound of the appropriate counter's
1187   // bracket and the destination operand scores.
1188   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1189   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1190     if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1191       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1192       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1193     } else {
1194       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1195     }
1196   } else if (TII->isFLAT(Inst)) {
1197     assert(Inst.mayLoad() || Inst.mayStore());
1198
1199     if (TII->usesVM_CNT(Inst))
1200       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1201
1202     if (TII->usesLGKM_CNT(Inst)) {
1203       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1204
1205       // This is a flat memory operation, so note it - it will require
1206       // that both the VM and LGKM be flushed to zero if it is pending when
1207       // a VM or LGKM dependency occurs.
1208       if (mayAccessLDSThroughFlat(Inst))
1209         ScoreBrackets->setPendingFlat();
1210     }
1211   } else if (SIInstrInfo::isVMEM(Inst) &&
1212              // TODO: get a better carve out.
1213              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1214              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1215              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1216     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1217     if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
1218         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1219       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1220     }
1221   } else if (TII->isSMRD(Inst)) {
1222     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1223   } else {
1224     switch (Inst.getOpcode()) {
1225     case AMDGPU::S_SENDMSG:
1226     case AMDGPU::S_SENDMSGHALT:
1227       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1228       break;
1229     case AMDGPU::EXP:
1230     case AMDGPU::EXP_DONE: {
1231       int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1232       if (Imm >= 32 && Imm <= 63)
1233         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1234       else if (Imm >= 12 && Imm <= 15)
1235         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1236       else
1237         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1238       break;
1239     }
1240     case AMDGPU::S_MEMTIME:
1241     case AMDGPU::S_MEMREALTIME:
1242       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1243       break;
1244     default:
1245       break;
1246     }
1247   }
1248 }
1249
1250 void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1251   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1252   int32_t MaxPending[NUM_INST_CNTS] = {0};
1253   int32_t MaxFlat[NUM_INST_CNTS] = {0};
1254   bool MixedExpTypes = false;
1255
1256   // Clear the score bracket state.
1257   ScoreBrackets->clear();
1258
1259   // Compute the number of pending elements on block entry.
1260
1261   // IMPORTANT NOTE: If iterative handling of loops is added, the code will
1262   // need to handle single BBs with backedges to themselves. This means that
1263   // they will need to retain and not clear their initial state.
1264
1265   // See if there are any uninitialized predecessors. If so, emit an
1266   // s_waitcnt 0 at the beginning of the block.
1267   for (MachineBasicBlock *pred : Block.predecessors()) {
1268     BlockWaitcntBrackets *PredScoreBrackets =
1269         BlockWaitcntBracketsMap[pred].get();
1270     bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
1271     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1272       continue;
1273     }
1274     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1275          T = (enum InstCounterType)(T + 1)) {
1276       int span =
1277           PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1278       MaxPending[T] = std::max(MaxPending[T], span);
1279       span =
1280           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1281       MaxFlat[T] = std::max(MaxFlat[T], span);
1282     }
1283
1284     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1285   }
1286
1287   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1288   // Also handle kills for exit block.
1289   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1290     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1291       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1292            T = (enum InstCounterType)(T + 1)) {
1293         int Span = KillWaitBrackets[I]->getScoreUB(T) -
1294                    KillWaitBrackets[I]->getScoreLB(T);
1295         MaxPending[T] = std::max(MaxPending[T], Span);
1296         Span = KillWaitBrackets[I]->pendingFlat(T) -
1297                KillWaitBrackets[I]->getScoreLB(T);
1298         MaxFlat[T] = std::max(MaxFlat[T], Span);
1299       }
1300
1301       MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1302     }
1303   }
1304
1305   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1306   for (MachineBasicBlock *Pred : Block.predecessors()) {
1307     BlockWaitcntBrackets *PredScoreBrackets =
1308         BlockWaitcntBracketsMap[Pred].get();
1309     bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
1310     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1311       continue;
1312     }
1313
1314     int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1315                   PredScoreBrackets->getScoreLB(EXP_CNT);
1316     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1317     int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1318                   PredScoreBrackets->getScoreLB(EXP_CNT);
1319     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1320   }
1321
1322   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1323   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1324     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1325       int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1326                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1327       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1328       int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1329                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1330       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1331     }
1332   }
1333
1334 #if 0
1335   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1336   // TODO: how does LC distinguish between function entry and main entry?
1337   // If this is the entry to a function, force a wait.
1338   MachineBasicBlock &Entry = Block.getParent()->front();
1339   if (Entry.getNumber() == Block.getNumber()) {
1340     ScoreBrackets->setWaitAtBeginning();
1341     return;
1342   }
1343 #endif
1344
1345   // Now set the current Block's brackets to the largest ending bracket.
1346   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1347        T = (enum InstCounterType)(T + 1)) {
1348     ScoreBrackets->setScoreUB(T, MaxPending[T]);
1349     ScoreBrackets->setScoreLB(T, 0);
1350     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1351   }
1352
1353   ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1354
1355   // Set the register scoreboard.
1356   for (MachineBasicBlock *Pred : Block.predecessors()) {
1357     if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
1358       continue;
1359     }
1360
1361     BlockWaitcntBrackets *PredScoreBrackets =
1362         BlockWaitcntBracketsMap[Pred].get();
1363
1364     // Now merge the gpr_reg_score information
1365     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1366          T = (enum InstCounterType)(T + 1)) {
1367       int PredLB = PredScoreBrackets->getScoreLB(T);
1368       int PredUB = PredScoreBrackets->getScoreUB(T);
1369       if (PredLB < PredUB) {
1370         int PredScale = MaxPending[T] - PredUB;
1371         // Merge vgpr scores.
1372         for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1373           int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1374           if (PredRegScore <= PredLB)
1375             continue;
1376           int NewRegScore = PredScale + PredRegScore;
1377           ScoreBrackets->setRegScore(
1378               J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1379         }
1380         // Also need to merge sgpr scores for lgkm_cnt.
1381         if (T == LGKM_CNT) {
1382           for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1383             int PredRegScore =
1384                 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1385             if (PredRegScore <= PredLB)
1386               continue;
1387             int NewRegScore = PredScale + PredRegScore;
1388             ScoreBrackets->setRegScore(
1389                 J + NUM_ALL_VGPRS, LGKM_CNT,
1390                 std::max(
1391                     ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1392                     NewRegScore));
1393           }
1394         }
1395       }
1396     }
1397
1398     // Also merge the WaitEvent information.
1399     ForAllWaitEventType(W) {
1400       enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1401       int PredEventUB = PredScoreBrackets->getEventUB(W);
1402       if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1403         int NewEventUB =
1404             MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1405         if (NewEventUB > 0) {
1406           ScoreBrackets->setEventUB(
1407               W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1408         }
1409       }
1410     }
1411   }
1412
1413   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1414   // Set the register scoreboard.
1415   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1416     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1417       // Now merge the gpr_reg_score information.
1418       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1419            T = (enum InstCounterType)(T + 1)) {
1420         int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1421         int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1422         if (PredLB < PredUB) {
1423           int PredScale = MaxPending[T] - PredUB;
1424           // Merge vgpr scores.
1425           for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1426             int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1427             if (PredRegScore <= PredLB)
1428               continue;
1429             int NewRegScore = PredScale + PredRegScore;
1430             ScoreBrackets->setRegScore(
1431                 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1432           }
1433           // Also need to merge sgpr scores for lgkm_cnt.
1434           if (T == LGKM_CNT) {
1435             for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1436               int PredRegScore =
1437                   KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1438               if (PredRegScore <= PredLB)
1439                 continue;
1440               int NewRegScore = PredScale + PredRegScore;
1441               ScoreBrackets->setRegScore(
1442                   J + NUM_ALL_VGPRS, LGKM_CNT,
1443                   std::max(
1444                       ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1445                       NewRegScore));
1446             }
1447           }
1448         }
1449       }
1450
1451       // Also merge the WaitEvent information.
1452       ForAllWaitEventType(W) {
1453         enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1454         int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1455         if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1456           int NewEventUB =
1457               MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1458           if (NewEventUB > 0) {
1459             ScoreBrackets->setEventUB(
1460                 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1461           }
1462         }
1463       }
1464     }
1465   }
1466
1467   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1468   // sequencing predecessors, because changes to EXEC require waitcnts due to
1469   // the delayed nature of these operations.
1470   for (MachineBasicBlock *Pred : Block.predecessors()) {
1471     if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
1472       continue;
1473     }
1474
1475     BlockWaitcntBrackets *PredScoreBrackets =
1476         BlockWaitcntBracketsMap[Pred].get();
1477
1478     int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1479     if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1480       int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1481                        PredScoreBrackets->getScoreUB(EXP_CNT);
1482       if (new_gds_ub > 0) {
1483         ScoreBrackets->setEventUB(
1484             GDS_GPR_LOCK,
1485             std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1486       }
1487     }
1488     int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1489     if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1490       int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1491                        PredScoreBrackets->getScoreUB(EXP_CNT);
1492       if (new_exp_ub > 0) {
1493         ScoreBrackets->setEventUB(
1494             EXP_GPR_LOCK,
1495             std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1496       }
1497     }
1498   }
1499 }
1500
1501 /// Return the "bottom" block of a loop. This differs from
1502 /// MachineLoop::getBottomBlock in that it works even if the loop is
1503 /// discontiguous.
1504 MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
1505   MachineBasicBlock *Bottom = Loop->getHeader();
1506   for (MachineBasicBlock *MBB : Loop->blocks())
1507     if (MBB->getNumber() > Bottom->getNumber())
1508       Bottom = MBB;
1509   return Bottom;
1510 }
1511
1512 // Generate s_waitcnt instructions where needed.
1513 void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1514                                             MachineBasicBlock &Block) {
1515   // Initialize the state information.
1516   mergeInputScoreBrackets(Block);
1517
1518   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1519
1520   DEBUG({
1521     dbgs() << "Block" << Block.getNumber();
1522     ScoreBrackets->dump();
1523   });
1524
1525   // Walk over the instructions.
1526   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1527        Iter != E;) {
1528     MachineInstr &Inst = *Iter;
1529     // Remove any previously existing waitcnts.
1530     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1531       // TODO: Register the old waitcnt and optimize the following waitcnts.
1532       // Leaving the previously existing waitcnts is conservatively correct.
1533       if (CompilerGeneratedWaitcntSet.find(&Inst) ==
1534           CompilerGeneratedWaitcntSet.end())
1535         ++Iter;
1536       else {
1537         ScoreBrackets->setWaitcnt(&Inst);
1538         ++Iter;
1539         Inst.removeFromParent();
1540       }
1541       continue;
1542     }
1543
1544     // Kill instructions generate a conditional branch to the endmain block.
1545     // Merge the current waitcnt state into the endmain block information.
1546     // TODO: Are there other flavors of KILL instruction?
1547     if (Inst.getOpcode() == AMDGPU::KILL) {
1548       addKillWaitBracket(ScoreBrackets);
1549     }
1550
1551     bool VCCZBugWorkAround = false;
1552     if (readsVCCZ(Inst) &&
1553         (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
1554       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1555               ScoreBrackets->getScoreUB(LGKM_CNT) &&
1556           ScoreBrackets->hasPendingSMEM()) {
1557         if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
1558           VCCZBugWorkAround = true;
1559       }
1560     }
1561
1562     // Generate an s_waitcnt instruction to be placed before
1563     // cur_Inst, if needed.
1564     MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
1565
1566     if (SWaitInst) {
1567       Block.insert(Inst, SWaitInst);
1568       if (ScoreBrackets->getWaitcnt() != SWaitInst) {
1569         DEBUG(dbgs() << "insertWaitcntInBlock\n"
1570                      << "Old Instr: " << Inst << '\n'
1571                      << "New Instr: " << *SWaitInst << '\n';);
1572       }
1573     }
1574
1575     updateEventWaitCntAfter(Inst, ScoreBrackets);
1576
1577 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1578     // If this instruction generates a S_SETVSKIP because it is an
1579     // indexed resource, and we are on Tahiti, then it will also force
1580     // an S_WAITCNT vmcnt(0)
1581     if (RequireCheckResourceType(Inst, context)) {
1582       // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1583       ScoreBrackets->setScoreLB(VM_CNT,
1584       ScoreBrackets->getScoreUB(VM_CNT));
1585     }
1586 #endif
1587
1588     ScoreBrackets->clearWaitcnt();
1589
1590     if (SWaitInst) {
1591       DEBUG({ SWaitInst->print(dbgs() << '\n'); });
1592     }
1593     DEBUG({
1594       Inst.print(dbgs());
1595       ScoreBrackets->dump();
1596     });
1597
1598     // Check to see if this is a GWS instruction. If so, and if this is CI or
1599     // VI, then the generated code sequence will include an S_WAITCNT 0.
1600     // TODO: Are these the only GWS instructions?
1601     if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1602         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1603         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1604         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1605         Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1606       // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1607       ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1608       ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1609       ScoreBrackets->updateByWait(LGKM_CNT,
1610                                   ScoreBrackets->getScoreUB(LGKM_CNT));
1611     }
1612
1613     // TODO: Remove this work-around after fixing the scheduler and enable the
1614     // assert above.
1615     if (VCCZBugWorkAround) {
1616       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
1617       // bit is updated, so we can restore the bit by reading the value of
1618       // vcc and then writing it back to the register.
1619       BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1620               AMDGPU::VCC)
1621           .addReg(AMDGPU::VCC);
1622       VCCZBugHandledSet.insert(&Inst);
1623     }
1624
1625     ++Iter;
1626   }
1627
1628   // Check if we need to force convergence at loop footer.
1629   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
1630   if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
1631     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1632     WaitcntData->print();
1633     DEBUG(dbgs() << '\n';);
1634
1635     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
1636     // placement and doesn't always guarantee convergence for a loop. Each
1637     // loop should take at most 2 iterations for it to converge naturally.
1638     // When this max is reached and result doesn't converge, we force
1639     // convergence by inserting a s_waitcnt at the end of loop footer.
1640     if (WaitcntData->getIterCnt() > 2) {
1641       // To ensure convergence, need to make wait events at loop footer be no
1642       // more than those from the previous iteration.
1643       // As a simplification, Instead of tracking individual scores and
1644       // generate the precise wait count, just wait on 0.
1645       bool HasPending = false;
1646       MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1647       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1648            T = (enum InstCounterType)(T + 1)) {
1649         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1650           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1651           HasPending = true;
1652         }
1653       }
1654
1655       if (HasPending) {
1656         if (!SWaitInst) {
1657           SWaitInst = Block.getParent()->CreateMachineInstr(
1658               TII->get(AMDGPU::S_WAITCNT), DebugLoc());
1659           CompilerGeneratedWaitcntSet.insert(SWaitInst);
1660           const MachineOperand &Op = MachineOperand::CreateImm(0);
1661           SWaitInst->addOperand(MF, Op);
1662 #if 0 // TODO: Format the debug output
1663           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1664           OutputTransformAdd(SWaitInst, context);
1665 #endif
1666         }
1667 #if 0 // TODO: ??
1668         _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1669 #endif
1670       }
1671
1672       if (SWaitInst) {
1673         DEBUG({
1674           SWaitInst->print(dbgs());
1675           dbgs() << "\nAdjusted score board:";
1676           ScoreBrackets->dump();
1677         });
1678
1679         // Add this waitcnt to the block. It is either newly created or
1680         // created in previous iterations and added back since block traversal
1681         // always remove waitcnt.
1682         insertWaitcntBeforeCF(Block, SWaitInst);
1683         WaitcntData->setWaitcnt(SWaitInst);
1684       }
1685     }
1686   }
1687 }
1688
1689 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1690   ST = &MF.getSubtarget<SISubtarget>();
1691   TII = ST->getInstrInfo();
1692   TRI = &TII->getRegisterInfo();
1693   MRI = &MF.getRegInfo();
1694   MLI = &getAnalysis<MachineLoopInfo>();
1695   IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
1696   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1697   AMDGPUASI = ST->getAMDGPUAS();
1698
1699   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1700   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1701   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1702
1703   HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1704   HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1705   assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1706   assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1707
1708   RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1709   RegisterEncoding.VGPRL =
1710       RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1711   RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1712   RegisterEncoding.SGPRL =
1713       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1714
1715   // Walk over the blocks in reverse post-dominator order, inserting
1716   // s_waitcnt where needed.
1717   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1718   bool Modified = false;
1719   for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1720            I = RPOT.begin(),
1721            E = RPOT.end(), J = RPOT.begin();
1722        I != E;) {
1723     MachineBasicBlock &MBB = **I;
1724
1725     BlockVisitedSet.insert(&MBB);
1726
1727     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1728     if (!ScoreBrackets) {
1729       BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
1730       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1731     }
1732     ScoreBrackets->setPostOrder(MBB.getNumber());
1733     MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1734     if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
1735       LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
1736
1737     // If we are walking into the block from before the loop, then guarantee
1738     // at least 1 re-walk over the loop to propagate the information, even if
1739     // no S_WAITCNT instructions were generated.
1740     if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
1741         (BlockWaitcntProcessedSet.find(&MBB) ==
1742          BlockWaitcntProcessedSet.end())) {
1743       BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1744       DEBUG(dbgs() << "set-revisit: block"
1745                    << ContainingLoop->getHeader()->getNumber() << '\n';);
1746     }
1747
1748     // Walk over the instructions.
1749     insertWaitcntInBlock(MF, MBB);
1750
1751     // Flag that waitcnts have been processed at least once.
1752     BlockWaitcntProcessedSet.insert(&MBB);
1753
1754     // See if we want to revisit the loop.
1755     if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
1756       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
1757       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1758       if (EntrySB && EntrySB->getRevisitLoop()) {
1759         EntrySB->setRevisitLoop(false);
1760         J = I;
1761         int32_t PostOrder = EntrySB->getPostOrder();
1762         // TODO: Avoid this loop. Find another way to set I.
1763         for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1764                  X = RPOT.begin(),
1765                  Y = RPOT.end();
1766              X != Y; ++X) {
1767           MachineBasicBlock &MBBX = **X;
1768           if (MBBX.getNumber() == PostOrder) {
1769             I = X;
1770             break;
1771           }
1772         }
1773         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1774         WaitcntData->incIterCnt();
1775         DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
1776         continue;
1777       } else {
1778         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1779         // Loop converged, reset iteration count. If this loop gets revisited,
1780         // it must be from an outer loop, the counter will restart, this will
1781         // ensure we don't force convergence on such revisits.
1782         WaitcntData->resetIterCnt();
1783       }
1784     }
1785
1786     J = I;
1787     ++I;
1788   }
1789
1790   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1791
1792   bool HaveScalarStores = false;
1793
1794   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1795        ++BI) {
1796     MachineBasicBlock &MBB = *BI;
1797
1798     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1799          ++I) {
1800       if (!HaveScalarStores && TII->isScalarStore(*I))
1801         HaveScalarStores = true;
1802
1803       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1804           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1805         EndPgmBlocks.push_back(&MBB);
1806     }
1807   }
1808
1809   if (HaveScalarStores) {
1810     // If scalar writes are used, the cache must be flushed or else the next
1811     // wave to reuse the same scratch memory can be clobbered.
1812     //
1813     // Insert s_dcache_wb at wave termination points if there were any scalar
1814     // stores, and only if the cache hasn't already been flushed. This could be
1815     // improved by looking across blocks for flushes in postdominating blocks
1816     // from the stores but an explicitly requested flush is probably very rare.
1817     for (MachineBasicBlock *MBB : EndPgmBlocks) {
1818       bool SeenDCacheWB = false;
1819
1820       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1821            ++I) {
1822         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1823           SeenDCacheWB = true;
1824         else if (TII->isScalarStore(*I))
1825           SeenDCacheWB = false;
1826
1827         // FIXME: It would be better to insert this before a waitcnt if any.
1828         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1829              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1830             !SeenDCacheWB) {
1831           Modified = true;
1832           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1833         }
1834       }
1835     }
1836   }
1837
1838   if (!MFI->isEntryFunction()) {
1839     // Wait for any outstanding memory operations that the input registers may
1840     // depend on. We can't track them and it's better to to the wait after the
1841     // costly call sequence.
1842
1843     // TODO: Could insert earlier and schedule more liberally with operations
1844     // that only use caller preserved registers.
1845     MachineBasicBlock &EntryBB = MF.front();
1846     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1847       .addImm(0);
1848
1849     Modified = true;
1850   }
1851
1852   return Modified;
1853 }