contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp

   1 //===-- SIInsertWaits.cpp - Insert wait instructions ----------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Insert wait instructions for memory reads and writes.
  12 ///
  13 /// Memory reads and writes are issued asynchronously, so we need to insert
  14 /// S_WAITCNT instructions when we want to access any of their results or
  15 /// overwrite any register that's used asynchronously.
  16 //
  17 //===----------------------------------------------------------------------===//
  18
  19 #include "AMDGPU.h"
  20 #include "AMDGPUSubtarget.h"
  21 #include "SIDefines.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIMachineFunctionInfo.h"
  24 #include "SIRegisterInfo.h"
  25 #include "Utils/AMDGPUBaseInfo.h"
  26 #include "llvm/ADT/SmallVector.h"
  27 #include "llvm/ADT/StringRef.h"
  28 #include "llvm/CodeGen/MachineBasicBlock.h"
  29 #include "llvm/CodeGen/MachineFunction.h"
  30 #include "llvm/CodeGen/MachineFunctionPass.h"
  31 #include "llvm/CodeGen/MachineInstr.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineOperand.h"
  34 #include "llvm/CodeGen/MachineRegisterInfo.h"
  35 #include "llvm/IR/DebugLoc.h"
  36 #include "llvm/Pass.h"
  37 #include "llvm/Support/Debug.h"
  38 #include "llvm/Support/raw_ostream.h"
  39 #include "llvm/Target/TargetRegisterInfo.h"
  40 #include <algorithm>
  41 #include <cassert>
  42 #include <cstdint>
  43 #include <cstring>
  44 #include <new>
  45 #include <utility>
  46
  47 #define DEBUG_TYPE "si-insert-waits"
  48
  49 using namespace llvm;
  50
  51 namespace {
  52
  53 /// \brief One variable for each of the hardware counters
  54 typedef union {
  55   struct {
  56     unsigned VM;
  57     unsigned EXP;
  58     unsigned LGKM;
  59   } Named;
  60   unsigned Array[3];
  61 } Counters;
  62
  63 typedef enum {
  64   OTHER,
  65   SMEM,
  66   VMEM
  67 } InstType;
  68
  69 typedef Counters RegCounters[512];
  70 typedef std::pair<unsigned, unsigned> RegInterval;
  71
  72 class SIInsertWaits : public MachineFunctionPass {
  73 private:
  74   const SISubtarget *ST = nullptr;
  75   const SIInstrInfo *TII = nullptr;
  76   const SIRegisterInfo *TRI = nullptr;
  77   const MachineRegisterInfo *MRI;
  78   AMDGPU::IsaInfo::IsaVersion ISA;
  79
  80   /// \brief Constant zero value
  81   static const Counters ZeroCounts;
  82
  83   /// \brief Hardware limits
  84   Counters HardwareLimits;
  85
  86   /// \brief Counter values we have already waited on.
  87   Counters WaitedOn;
  88
  89   /// \brief Counter values that we must wait on before the next counter
  90   /// increase.
  91   Counters DelayedWaitOn;
  92
  93   /// \brief Counter values for last instruction issued.
  94   Counters LastIssued;
  95
  96   /// \brief Registers used by async instructions.
  97   RegCounters UsedRegs;
  98
  99   /// \brief Registers defined by async instructions.
 100   RegCounters DefinedRegs;
 101
 102   /// \brief Different export instruction types seen since last wait.
 103   unsigned ExpInstrTypesSeen = 0;
 104
 105   /// \brief Type of the last opcode.
 106   InstType LastOpcodeType;
 107
 108   bool LastInstWritesM0;
 109
 110   /// Whether or not we have flat operations outstanding.
 111   bool IsFlatOutstanding;
 112
 113   /// \brief Whether the machine function returns void
 114   bool ReturnsVoid;
 115
 116   /// Whether the VCCZ bit is possibly corrupt
 117   bool VCCZCorrupt = false;
 118
 119   /// \brief Get increment/decrement amount for this instruction.
 120   Counters getHwCounts(MachineInstr &MI);
 121
 122   /// \brief Is operand relevant for async execution?
 123   bool isOpRelevant(MachineOperand &Op);
 124
 125   /// \brief Get register interval an operand affects.
 126   RegInterval getRegInterval(const TargetRegisterClass *RC,
 127                              const MachineOperand &Reg) const;
 128
 129   /// \brief Handle instructions async components
 130   void pushInstruction(MachineBasicBlock &MBB,
 131                        MachineBasicBlock::iterator I,
 132                        const Counters& Increment);
 133
 134   /// \brief Insert the actual wait instruction
 135   bool insertWait(MachineBasicBlock &MBB,
 136                   MachineBasicBlock::iterator I,
 137                   const Counters &Counts);
 138
 139   /// \brief Handle existing wait instructions (from intrinsics)
 140   void handleExistingWait(MachineBasicBlock::iterator I);
 141
 142   /// \brief Do we need def2def checks?
 143   bool unorderedDefines(MachineInstr &MI);
 144
 145   /// \brief Resolve all operand dependencies to counter requirements
 146   Counters handleOperands(MachineInstr &MI);
 147
 148   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
 149   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 150
 151   /// Return true if there are LGKM instrucitons that haven't been waited on
 152   /// yet.
 153   bool hasOutstandingLGKM() const;
 154
 155 public:
 156   static char ID;
 157
 158   SIInsertWaits() : MachineFunctionPass(ID) {}
 159
 160   bool runOnMachineFunction(MachineFunction &MF) override;
 161
 162   StringRef getPassName() const override {
 163     return "SI insert wait instructions";
 164   }
 165
 166   void getAnalysisUsage(AnalysisUsage &AU) const override {
 167     AU.setPreservesCFG();
 168     MachineFunctionPass::getAnalysisUsage(AU);
 169   }
 170 };
 171
 172 } // end anonymous namespace
 173
 174 INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
 175                       "SI Insert Waits", false, false)
 176 INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
 177                     "SI Insert Waits", false, false)
 178
 179 char SIInsertWaits::ID = 0;
 180
 181 char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
 182
 183 FunctionPass *llvm::createSIInsertWaitsPass() {
 184   return new SIInsertWaits();
 185 }
 186
 187 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
 188
 189 static bool readsVCCZ(const MachineInstr &MI) {
 190   unsigned Opc = MI.getOpcode();
 191   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
 192          !MI.getOperand(1).isUndef();
 193 }
 194
 195 bool SIInsertWaits::hasOutstandingLGKM() const {
 196   return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
 197 }
 198
 199 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
 200   uint64_t TSFlags = MI.getDesc().TSFlags;
 201   Counters Result = { { 0, 0, 0 } };
 202
 203   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
 204
 205   // Only consider stores or EXP for EXP_CNT
 206   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
 207
 208   // LGKM may uses larger values
 209   if (TSFlags & SIInstrFlags::LGKM_CNT) {
 210
 211     if (TII->isSMRD(MI)) {
 212
 213       if (MI.getNumOperands() != 0) {
 214         assert(MI.getOperand(0).isReg() &&
 215                "First LGKM operand must be a register!");
 216
 217         // XXX - What if this is a write into a super register?
 218         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
 219         unsigned Size = TRI->getRegSizeInBits(*RC);
 220         Result.Named.LGKM = Size > 32 ? 2 : 1;
 221       } else {
 222         // s_dcache_inv etc. do not have a a destination register. Assume we
 223         // want a wait on these.
 224         // XXX - What is the right value?
 225         Result.Named.LGKM = 1;
 226       }
 227     } else {
 228       // DS
 229       Result.Named.LGKM = 1;
 230     }
 231
 232   } else {
 233     Result.Named.LGKM = 0;
 234   }
 235
 236   return Result;
 237 }
 238
 239 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
 240   // Constants are always irrelevant
 241   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 242     return false;
 243
 244   // Defines are always relevant
 245   if (Op.isDef())
 246     return true;
 247
 248   // For exports all registers are relevant.
 249   // TODO: Skip undef/disabled registers.
 250   MachineInstr &MI = *Op.getParent();
 251   if (TII->isEXP(MI))
 252     return true;
 253
 254   // For stores the stored value is also relevant
 255   if (!MI.getDesc().mayStore())
 256     return false;
 257
 258   // Check if this operand is the value being stored.
 259   // Special case for DS/FLAT instructions, since the address
 260   // operand comes before the value operand and it may have
 261   // multiple data operands.
 262
 263   if (TII->isDS(MI)) {
 264     MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
 265     if (Data0 && Op.isIdenticalTo(*Data0))
 266       return true;
 267
 268     MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
 269     return Data1 && Op.isIdenticalTo(*Data1);
 270   }
 271
 272   if (TII->isFLAT(MI)) {
 273     MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
 274     if (Data && Op.isIdenticalTo(*Data))
 275       return true;
 276   }
 277
 278   // NOTE: This assumes that the value operand is before the
 279   // address operand, and that there is only one value operand.
 280   for (MachineInstr::mop_iterator I = MI.operands_begin(),
 281        E = MI.operands_end(); I != E; ++I) {
 282
 283     if (I->isReg() && I->isUse())
 284       return Op.isIdenticalTo(*I);
 285   }
 286
 287   return false;
 288 }
 289
 290 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
 291                                           const MachineOperand &Reg) const {
 292   unsigned Size = TRI->getRegSizeInBits(*RC);
 293   assert(Size >= 32);
 294
 295   RegInterval Result;
 296   Result.first = TRI->getEncodingValue(Reg.getReg());
 297   Result.second = Result.first + Size / 32;
 298
 299   return Result;
 300 }
 301
 302 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
 303                                     MachineBasicBlock::iterator I,
 304                                     const Counters &Increment) {
 305   // Get the hardware counter increments and sum them up
 306   Counters Limit = ZeroCounts;
 307   unsigned Sum = 0;
 308
 309   if (TII->mayAccessFlatAddressSpace(*I))
 310     IsFlatOutstanding = true;
 311
 312   for (unsigned i = 0; i < 3; ++i) {
 313     LastIssued.Array[i] += Increment.Array[i];
 314     if (Increment.Array[i])
 315       Limit.Array[i] = LastIssued.Array[i];
 316     Sum += Increment.Array[i];
 317   }
 318
 319   // If we don't increase anything then that's it
 320   if (Sum == 0) {
 321     LastOpcodeType = OTHER;
 322     return;
 323   }
 324
 325   if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
 326     // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
 327     // or SMEM clause, respectively.
 328     //
 329     // The temporary workaround is to break the clauses with S_NOP.
 330     //
 331     // The proper solution would be to allocate registers such that all source
 332     // and destination registers don't overlap, e.g. this is illegal:
 333     //   r0 = load r2
 334     //   r2 = load r0
 335     if (LastOpcodeType == VMEM && Increment.Named.VM) {
 336       // Insert a NOP to break the clause.
 337       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
 338           .addImm(0);
 339       LastInstWritesM0 = false;
 340     }
 341
 342     if (TII->isSMRD(*I))
 343       LastOpcodeType = SMEM;
 344     else if (Increment.Named.VM)
 345       LastOpcodeType = VMEM;
 346   }
 347
 348   // Remember which export instructions we have seen
 349   if (Increment.Named.EXP) {
 350     ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
 351   }
 352
 353   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
 354     MachineOperand &Op = I->getOperand(i);
 355     if (!isOpRelevant(Op))
 356       continue;
 357
 358     const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
 359     RegInterval Interval = getRegInterval(RC, Op);
 360     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 361
 362       // Remember which registers we define
 363       if (Op.isDef())
 364         DefinedRegs[j] = Limit;
 365
 366       // and which one we are using
 367       if (Op.isUse())
 368         UsedRegs[j] = Limit;
 369     }
 370   }
 371 }
 372
 373 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
 374                                MachineBasicBlock::iterator I,
 375                                const Counters &Required) {
 376   // End of program? No need to wait on anything
 377   // A function not returning void needs to wait, because other bytecode will
 378   // be appended after it and we don't know what it will be.
 379   if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
 380     return false;
 381
 382   // Figure out if the async instructions execute in order
 383   bool Ordered[3];
 384
 385   // VM_CNT is always ordered except when there are flat instructions, which
 386   // can return out of order.
 387   Ordered[0] = !IsFlatOutstanding;
 388
 389   // EXP_CNT is unordered if we have both EXP & VM-writes
 390   Ordered[1] = ExpInstrTypesSeen == 3;
 391
 392   // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
 393   Ordered[2] = false;
 394
 395   // The values we are going to put into the S_WAITCNT instruction
 396   Counters Counts = HardwareLimits;
 397
 398   // Do we really need to wait?
 399   bool NeedWait = false;
 400
 401   for (unsigned i = 0; i < 3; ++i) {
 402     if (Required.Array[i] <= WaitedOn.Array[i])
 403       continue;
 404
 405     NeedWait = true;
 406
 407     if (Ordered[i]) {
 408       unsigned Value = LastIssued.Array[i] - Required.Array[i];
 409
 410       // Adjust the value to the real hardware possibilities.
 411       Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
 412
 413     } else
 414       Counts.Array[i] = 0;
 415
 416     // Remember on what we have waited on.
 417     WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
 418   }
 419
 420   if (!NeedWait)
 421     return false;
 422
 423   // Reset EXP_CNT instruction types
 424   if (Counts.Named.EXP == 0)
 425     ExpInstrTypesSeen = 0;
 426
 427   // Build the wait instruction
 428   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
 429     .addImm(AMDGPU::encodeWaitcnt(ISA,
 430                                   Counts.Named.VM,
 431                                   Counts.Named.EXP,
 432                                   Counts.Named.LGKM));
 433
 434   LastOpcodeType = OTHER;
 435   LastInstWritesM0 = false;
 436   IsFlatOutstanding = false;
 437   return true;
 438 }
 439
 440 /// \brief helper function for handleOperands
 441 static void increaseCounters(Counters &Dst, const Counters &Src) {
 442   for (unsigned i = 0; i < 3; ++i)
 443     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
 444 }
 445
 446 /// \brief check whether any of the counters is non-zero
 447 static bool countersNonZero(const Counters &Counter) {
 448   for (unsigned i = 0; i < 3; ++i)
 449     if (Counter.Array[i])
 450       return true;
 451   return false;
 452 }
 453
 454 void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
 455   assert(I->getOpcode() == AMDGPU::S_WAITCNT);
 456
 457   unsigned Imm = I->getOperand(0).getImm();
 458   Counters Counts, WaitOn;
 459
 460   Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
 461   Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
 462   Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
 463
 464   for (unsigned i = 0; i < 3; ++i) {
 465     if (Counts.Array[i] <= LastIssued.Array[i])
 466       WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
 467     else
 468       WaitOn.Array[i] = 0;
 469   }
 470
 471   increaseCounters(DelayedWaitOn, WaitOn);
 472 }
 473
 474 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
 475   Counters Result = ZeroCounts;
 476
 477   // For each register affected by this instruction increase the result
 478   // sequence.
 479   //
 480   // TODO: We could probably just look at explicit operands if we removed VCC /
 481   // EXEC from SMRD dest reg classes.
 482   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
 483     MachineOperand &Op = MI.getOperand(i);
 484     if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 485       continue;
 486
 487     const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
 488     RegInterval Interval = getRegInterval(RC, Op);
 489     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 490       if (Op.isDef()) {
 491         increaseCounters(Result, UsedRegs[j]);
 492         increaseCounters(Result, DefinedRegs[j]);
 493       }
 494
 495       if (Op.isUse())
 496         increaseCounters(Result, DefinedRegs[j]);
 497     }
 498   }
 499
 500   return Result;
 501 }
 502
 503 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
 504                                   MachineBasicBlock::iterator I) {
 505   if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
 506     return;
 507
 508   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
 509   if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
 510     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
 511     LastInstWritesM0 = false;
 512     return;
 513   }
 514
 515   // Set whether this instruction sets M0
 516   LastInstWritesM0 = false;
 517
 518   unsigned NumOperands = I->getNumOperands();
 519   for (unsigned i = 0; i < NumOperands; i++) {
 520     const MachineOperand &Op = I->getOperand(i);
 521
 522     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
 523       LastInstWritesM0 = true;
 524   }
 525 }
 526
 527 /// Return true if \p MBB has one successor immediately following, and is its
 528 /// only predecessor
 529 static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
 530   if (MBB.succ_size() != 1)
 531     return false;
 532
 533   const MachineBasicBlock *Succ = *MBB.succ_begin();
 534   return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
 535 }
 536
 537 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 538 // around other non-memory instructions.
 539 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
 540   bool Changes = false;
 541
 542   ST = &MF.getSubtarget<SISubtarget>();
 543   TII = ST->getInstrInfo();
 544   TRI = &TII->getRegisterInfo();
 545   MRI = &MF.getRegInfo();
 546   ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
 547   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 548
 549   HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
 550   HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
 551   HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
 552
 553   WaitedOn = ZeroCounts;
 554   DelayedWaitOn = ZeroCounts;
 555   LastIssued = ZeroCounts;
 556   LastOpcodeType = OTHER;
 557   LastInstWritesM0 = false;
 558   IsFlatOutstanding = false;
 559   ReturnsVoid = MFI->returnsVoid();
 560
 561   memset(&UsedRegs, 0, sizeof(UsedRegs));
 562   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 563
 564   SmallVector<MachineInstr *, 4> RemoveMI;
 565   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
 566
 567   bool HaveScalarStores = false;
 568
 569   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 570        BI != BE; ++BI) {
 571
 572     MachineBasicBlock &MBB = *BI;
 573
 574     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 575          I != E; ++I) {
 576
 577       if (!HaveScalarStores && TII->isScalarStore(*I))
 578         HaveScalarStores = true;
 579
 580       if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
 581         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
 582         // vccz bit, so when we detect that an instruction may read from a
 583         // corrupt vccz bit, we need to:
 584         // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
 585         //    complete.
 586         // 2. Restore the correct value of vccz by writing the current value
 587         //    of vcc back to vcc.
 588
 589         if (TII->isSMRD(I->getOpcode())) {
 590           VCCZCorrupt = true;
 591         } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
 592           // FIXME: We only care about SMRD instructions here, not LDS or GDS.
 593           // Whenever we store a value in vcc, the correct value of vccz is
 594           // restored.
 595           VCCZCorrupt = false;
 596         }
 597
 598         // Check if we need to apply the bug work-around
 599         if (VCCZCorrupt && readsVCCZ(*I)) {
 600           DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
 601
 602           // Wait on everything, not just LGKM.  vccz reads usually come from
 603           // terminators, and we always wait on everything at the end of the
 604           // block, so if we only wait on LGKM here, we might end up with
 605           // another s_waitcnt inserted right after this if there are non-LGKM
 606           // instructions still outstanding.
 607           insertWait(MBB, I, LastIssued);
 608
 609           // Restore the vccz bit.  Any time a value is written to vcc, the vcc
 610           // bit is updated, so we can restore the bit by reading the value of
 611           // vcc and then writing it back to the register.
 612           BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
 613                   AMDGPU::VCC)
 614             .addReg(AMDGPU::VCC);
 615         }
 616       }
 617
 618       // Record pre-existing, explicitly requested waits
 619       if (I->getOpcode() == AMDGPU::S_WAITCNT) {
 620         handleExistingWait(*I);
 621         RemoveMI.push_back(&*I);
 622         continue;
 623       }
 624
 625       Counters Required;
 626
 627       // Wait for everything before a barrier.
 628       //
 629       // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
 630       // but we also want to wait for any other outstanding transfers before
 631       // signalling other hardware blocks
 632       if ((I->getOpcode() == AMDGPU::S_BARRIER &&
 633                ST->needWaitcntBeforeBarrier()) ||
 634            I->getOpcode() == AMDGPU::S_SENDMSG ||
 635            I->getOpcode() == AMDGPU::S_SENDMSGHALT)
 636         Required = LastIssued;
 637       else
 638         Required = handleOperands(*I);
 639
 640       Counters Increment = getHwCounts(*I);
 641
 642       if (countersNonZero(Required) || countersNonZero(Increment))
 643         increaseCounters(Required, DelayedWaitOn);
 644
 645       Changes |= insertWait(MBB, I, Required);
 646
 647       pushInstruction(MBB, I, Increment);
 648       handleSendMsg(MBB, I);
 649
 650       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
 651           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
 652         EndPgmBlocks.push_back(&MBB);
 653     }
 654
 655     // Wait for everything at the end of the MBB. If there is only one
 656     // successor, we can defer this until the uses there.
 657     if (!hasTrivialSuccessor(MBB))
 658       Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
 659   }
 660
 661   if (HaveScalarStores) {
 662     // If scalar writes are used, the cache must be flushed or else the next
 663     // wave to reuse the same scratch memory can be clobbered.
 664     //
 665     // Insert s_dcache_wb at wave termination points if there were any scalar
 666     // stores, and only if the cache hasn't already been flushed. This could be
 667     // improved by looking across blocks for flushes in postdominating blocks
 668     // from the stores but an explicitly requested flush is probably very rare.
 669     for (MachineBasicBlock *MBB : EndPgmBlocks) {
 670       bool SeenDCacheWB = false;
 671
 672       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
 673            I != E; ++I) {
 674
 675         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
 676           SeenDCacheWB = true;
 677         else if (TII->isScalarStore(*I))
 678           SeenDCacheWB = false;
 679
 680         // FIXME: It would be better to insert this before a waitcnt if any.
 681         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
 682              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
 683           Changes = true;
 684           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
 685         }
 686       }
 687     }
 688   }
 689
 690   for (MachineInstr *I : RemoveMI)
 691     I->eraseFromParent();
 692
 693   if (!MFI->isEntryFunction()) {
 694     // Wait for any outstanding memory operations that the input registers may
 695     // depend on. We can't track them and it's better to to the wait after the
 696     // costly call sequence.
 697
 698     // TODO: Could insert earlier and schedule more liberally with operations
 699     // that only use caller preserved registers.
 700     MachineBasicBlock &EntryBB = MF.front();
 701     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
 702       .addImm(0);
 703
 704     Changes = true;
 705   }
 706
 707   return Changes;
 708 }