contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp

   1 //===-- SIInsertWaits.cpp - Insert Wait Instructions ----------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Insert wait instructions for memory reads and writes.
  12 ///
  13 /// Memory reads and writes are issued asynchronously, so we need to insert
  14 /// S_WAITCNT instructions when we want to access any of their results or
  15 /// overwrite any register that's used asynchronously.
  16 //
  17 //===----------------------------------------------------------------------===//
  18
  19 #include "AMDGPU.h"
  20 #include "AMDGPUSubtarget.h"
  21 #include "SIDefines.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIMachineFunctionInfo.h"
  24 #include "Utils/AMDGPUBaseInfo.h"
  25 #include "llvm/CodeGen/MachineFunction.h"
  26 #include "llvm/CodeGen/MachineFunctionPass.h"
  27 #include "llvm/CodeGen/MachineInstrBuilder.h"
  28 #include "llvm/CodeGen/MachineRegisterInfo.h"
  29
  30 #define DEBUG_TYPE "si-insert-waits"
  31
  32 using namespace llvm;
  33 using namespace llvm::AMDGPU;
  34
  35 namespace {
  36
  37 /// \brief One variable for each of the hardware counters
  38 typedef union {
  39   struct {
  40     unsigned VM;
  41     unsigned EXP;
  42     unsigned LGKM;
  43   } Named;
  44   unsigned Array[3];
  45
  46 } Counters;
  47
  48 typedef enum {
  49   OTHER,
  50   SMEM,
  51   VMEM
  52 } InstType;
  53
  54 typedef Counters RegCounters[512];
  55 typedef std::pair<unsigned, unsigned> RegInterval;
  56
  57 class SIInsertWaits : public MachineFunctionPass {
  58
  59 private:
  60   const SISubtarget *ST;
  61   const SIInstrInfo *TII;
  62   const SIRegisterInfo *TRI;
  63   const MachineRegisterInfo *MRI;
  64   IsaVersion IV;
  65
  66   /// \brief Constant zero value
  67   static const Counters ZeroCounts;
  68
  69   /// \brief Hardware limits
  70   Counters HardwareLimits;
  71
  72   /// \brief Counter values we have already waited on.
  73   Counters WaitedOn;
  74
  75   /// \brief Counter values that we must wait on before the next counter
  76   /// increase.
  77   Counters DelayedWaitOn;
  78
  79   /// \brief Counter values for last instruction issued.
  80   Counters LastIssued;
  81
  82   /// \brief Registers used by async instructions.
  83   RegCounters UsedRegs;
  84
  85   /// \brief Registers defined by async instructions.
  86   RegCounters DefinedRegs;
  87
  88   /// \brief Different export instruction types seen since last wait.
  89   unsigned ExpInstrTypesSeen;
  90
  91   /// \brief Type of the last opcode.
  92   InstType LastOpcodeType;
  93
  94   bool LastInstWritesM0;
  95
  96   /// Whether or not we have flat operations outstanding.
  97   bool IsFlatOutstanding;
  98
  99   /// \brief Whether the machine function returns void
 100   bool ReturnsVoid;
 101
 102   /// Whether the VCCZ bit is possibly corrupt
 103   bool VCCZCorrupt;
 104
 105   /// \brief Get increment/decrement amount for this instruction.
 106   Counters getHwCounts(MachineInstr &MI);
 107
 108   /// \brief Is operand relevant for async execution?
 109   bool isOpRelevant(MachineOperand &Op);
 110
 111   /// \brief Get register interval an operand affects.
 112   RegInterval getRegInterval(const TargetRegisterClass *RC,
 113                              const MachineOperand &Reg) const;
 114
 115   /// \brief Handle instructions async components
 116   void pushInstruction(MachineBasicBlock &MBB,
 117                        MachineBasicBlock::iterator I,
 118                        const Counters& Increment);
 119
 120   /// \brief Insert the actual wait instruction
 121   bool insertWait(MachineBasicBlock &MBB,
 122                   MachineBasicBlock::iterator I,
 123                   const Counters &Counts);
 124
 125   /// \brief Handle existing wait instructions (from intrinsics)
 126   void handleExistingWait(MachineBasicBlock::iterator I);
 127
 128   /// \brief Do we need def2def checks?
 129   bool unorderedDefines(MachineInstr &MI);
 130
 131   /// \brief Resolve all operand dependencies to counter requirements
 132   Counters handleOperands(MachineInstr &MI);
 133
 134   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
 135   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 136
 137   /// Return true if there are LGKM instrucitons that haven't been waited on
 138   /// yet.
 139   bool hasOutstandingLGKM() const;
 140
 141 public:
 142   static char ID;
 143
 144   SIInsertWaits() :
 145     MachineFunctionPass(ID),
 146     ST(nullptr),
 147     TII(nullptr),
 148     TRI(nullptr),
 149     ExpInstrTypesSeen(0),
 150     VCCZCorrupt(false) { }
 151
 152   bool runOnMachineFunction(MachineFunction &MF) override;
 153
 154   StringRef getPassName() const override {
 155     return "SI insert wait instructions";
 156   }
 157
 158   void getAnalysisUsage(AnalysisUsage &AU) const override {
 159     AU.setPreservesCFG();
 160     MachineFunctionPass::getAnalysisUsage(AU);
 161   }
 162 };
 163
 164 } // End anonymous namespace
 165
// Pass registration boilerplate. The pass is registered under the name
// DEBUG_TYPE ("si-insert-waits") with the description "SI Insert Waits".
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only / is-analysis flags of LLVM's INITIALIZE_PASS macros - confirm
// against PassSupport.h.
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
                      "SI Insert Waits", false, false)
INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
                    "SI Insert Waits", false, false)

// Storage for the pass ID declared in the class.
char SIInsertWaits::ID = 0;

// Alias of the pass ID exposed in the llvm namespace.
char &llvm::SIInsertWaitsID = SIInsertWaits::ID;

// Factory: returns a newly heap-allocated pass instance; the caller takes
// over the returned pointer.
FunctionPass *llvm::createSIInsertWaitsPass() {
  return new SIInsertWaits();
}

// All three counters (VM, EXP, LGKM) set to zero.
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
 180
 181 static bool readsVCCZ(const MachineInstr &MI) {
 182   unsigned Opc = MI.getOpcode();
 183   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
 184          !MI.getOperand(1).isUndef();
 185 }
 186
 187 bool SIInsertWaits::hasOutstandingLGKM() const {
 188   return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
 189 }
 190
 191 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
 192   uint64_t TSFlags = MI.getDesc().TSFlags;
 193   Counters Result = { { 0, 0, 0 } };
 194
 195   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
 196
 197   // Only consider stores or EXP for EXP_CNT
 198   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
 199
 200   // LGKM may uses larger values
 201   if (TSFlags & SIInstrFlags::LGKM_CNT) {
 202
 203     if (TII->isSMRD(MI)) {
 204
 205       if (MI.getNumOperands() != 0) {
 206         assert(MI.getOperand(0).isReg() &&
 207                "First LGKM operand must be a register!");
 208
 209         // XXX - What if this is a write into a super register?
 210         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
 211         unsigned Size = RC->getSize();
 212         Result.Named.LGKM = Size > 4 ? 2 : 1;
 213       } else {
 214         // s_dcache_inv etc. do not have a a destination register. Assume we
 215         // want a wait on these.
 216         // XXX - What is the right value?
 217         Result.Named.LGKM = 1;
 218       }
 219     } else {
 220       // DS
 221       Result.Named.LGKM = 1;
 222     }
 223
 224   } else {
 225     Result.Named.LGKM = 0;
 226   }
 227
 228   return Result;
 229 }
 230
 231 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
 232   // Constants are always irrelevant
 233   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 234     return false;
 235
 236   // Defines are always relevant
 237   if (Op.isDef())
 238     return true;
 239
 240   // For exports all registers are relevant.
 241   // TODO: Skip undef/disabled registers.
 242   MachineInstr &MI = *Op.getParent();
 243   if (TII->isEXP(MI))
 244     return true;
 245
 246   // For stores the stored value is also relevant
 247   if (!MI.getDesc().mayStore())
 248     return false;
 249
 250   // Check if this operand is the value being stored.
 251   // Special case for DS/FLAT instructions, since the address
 252   // operand comes before the value operand and it may have
 253   // multiple data operands.
 254
 255   if (TII->isDS(MI)) {
 256     MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
 257     if (Data0 && Op.isIdenticalTo(*Data0))
 258       return true;
 259
 260     MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
 261     return Data1 && Op.isIdenticalTo(*Data1);
 262   }
 263
 264   if (TII->isFLAT(MI)) {
 265     MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
 266     if (Data && Op.isIdenticalTo(*Data))
 267       return true;
 268   }
 269
 270   // NOTE: This assumes that the value operand is before the
 271   // address operand, and that there is only one value operand.
 272   for (MachineInstr::mop_iterator I = MI.operands_begin(),
 273        E = MI.operands_end(); I != E; ++I) {
 274
 275     if (I->isReg() && I->isUse())
 276       return Op.isIdenticalTo(*I);
 277   }
 278
 279   return false;
 280 }
 281
 282 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
 283                                           const MachineOperand &Reg) const {
 284   unsigned Size = RC->getSize();
 285   assert(Size >= 4);
 286
 287   RegInterval Result;
 288   Result.first = TRI->getEncodingValue(Reg.getReg());
 289   Result.second = Result.first + Size / 4;
 290
 291   return Result;
 292 }
 293
 294 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
 295                                     MachineBasicBlock::iterator I,
 296                                     const Counters &Increment) {
 297
 298   // Get the hardware counter increments and sum them up
 299   Counters Limit = ZeroCounts;
 300   unsigned Sum = 0;
 301
 302   if (TII->mayAccessFlatAddressSpace(*I))
 303     IsFlatOutstanding = true;
 304
 305   for (unsigned i = 0; i < 3; ++i) {
 306     LastIssued.Array[i] += Increment.Array[i];
 307     if (Increment.Array[i])
 308       Limit.Array[i] = LastIssued.Array[i];
 309     Sum += Increment.Array[i];
 310   }
 311
 312   // If we don't increase anything then that's it
 313   if (Sum == 0) {
 314     LastOpcodeType = OTHER;
 315     return;
 316   }
 317
 318   if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
 319     // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
 320     // or SMEM clause, respectively.
 321     //
 322     // The temporary workaround is to break the clauses with S_NOP.
 323     //
 324     // The proper solution would be to allocate registers such that all source
 325     // and destination registers don't overlap, e.g. this is illegal:
 326     //   r0 = load r2
 327     //   r2 = load r0
 328     if (LastOpcodeType == VMEM && Increment.Named.VM) {
 329       // Insert a NOP to break the clause.
 330       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
 331           .addImm(0);
 332       LastInstWritesM0 = false;
 333     }
 334
 335     if (TII->isSMRD(*I))
 336       LastOpcodeType = SMEM;
 337     else if (Increment.Named.VM)
 338       LastOpcodeType = VMEM;
 339   }
 340
 341   // Remember which export instructions we have seen
 342   if (Increment.Named.EXP) {
 343     ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
 344   }
 345
 346   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
 347     MachineOperand &Op = I->getOperand(i);
 348     if (!isOpRelevant(Op))
 349       continue;
 350
 351     const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
 352     RegInterval Interval = getRegInterval(RC, Op);
 353     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 354
 355       // Remember which registers we define
 356       if (Op.isDef())
 357         DefinedRegs[j] = Limit;
 358
 359       // and which one we are using
 360       if (Op.isUse())
 361         UsedRegs[j] = Limit;
 362     }
 363   }
 364 }
 365
 366 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
 367                                MachineBasicBlock::iterator I,
 368                                const Counters &Required) {
 369
 370   // End of program? No need to wait on anything
 371   // A function not returning void needs to wait, because other bytecode will
 372   // be appended after it and we don't know what it will be.
 373   if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
 374     return false;
 375
 376   // Figure out if the async instructions execute in order
 377   bool Ordered[3];
 378
 379   // VM_CNT is always ordered except when there are flat instructions, which
 380   // can return out of order.
 381   Ordered[0] = !IsFlatOutstanding;
 382
 383   // EXP_CNT is unordered if we have both EXP & VM-writes
 384   Ordered[1] = ExpInstrTypesSeen == 3;
 385
 386   // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
 387   Ordered[2] = false;
 388
 389   // The values we are going to put into the S_WAITCNT instruction
 390   Counters Counts = HardwareLimits;
 391
 392   // Do we really need to wait?
 393   bool NeedWait = false;
 394
 395   for (unsigned i = 0; i < 3; ++i) {
 396
 397     if (Required.Array[i] <= WaitedOn.Array[i])
 398       continue;
 399
 400     NeedWait = true;
 401
 402     if (Ordered[i]) {
 403       unsigned Value = LastIssued.Array[i] - Required.Array[i];
 404
 405       // Adjust the value to the real hardware possibilities.
 406       Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
 407
 408     } else
 409       Counts.Array[i] = 0;
 410
 411     // Remember on what we have waited on.
 412     WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
 413   }
 414
 415   if (!NeedWait)
 416     return false;
 417
 418   // Reset EXP_CNT instruction types
 419   if (Counts.Named.EXP == 0)
 420     ExpInstrTypesSeen = 0;
 421
 422   // Build the wait instruction
 423   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
 424     .addImm(encodeWaitcnt(IV,
 425                           Counts.Named.VM,
 426                           Counts.Named.EXP,
 427                           Counts.Named.LGKM));
 428
 429   LastOpcodeType = OTHER;
 430   LastInstWritesM0 = false;
 431   IsFlatOutstanding = false;
 432   return true;
 433 }
 434
 435 /// \brief helper function for handleOperands
 436 static void increaseCounters(Counters &Dst, const Counters &Src) {
 437
 438   for (unsigned i = 0; i < 3; ++i)
 439     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
 440 }
 441
 442 /// \brief check whether any of the counters is non-zero
 443 static bool countersNonZero(const Counters &Counter) {
 444   for (unsigned i = 0; i < 3; ++i)
 445     if (Counter.Array[i])
 446       return true;
 447   return false;
 448 }
 449
 450 void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
 451   assert(I->getOpcode() == AMDGPU::S_WAITCNT);
 452
 453   unsigned Imm = I->getOperand(0).getImm();
 454   Counters Counts, WaitOn;
 455
 456   Counts.Named.VM = decodeVmcnt(IV, Imm);
 457   Counts.Named.EXP = decodeExpcnt(IV, Imm);
 458   Counts.Named.LGKM = decodeLgkmcnt(IV, Imm);
 459
 460   for (unsigned i = 0; i < 3; ++i) {
 461     if (Counts.Array[i] <= LastIssued.Array[i])
 462       WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
 463     else
 464       WaitOn.Array[i] = 0;
 465   }
 466
 467   increaseCounters(DelayedWaitOn, WaitOn);
 468 }
 469
 470 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
 471
 472   Counters Result = ZeroCounts;
 473
 474   // For each register affected by this instruction increase the result
 475   // sequence.
 476   //
 477   // TODO: We could probably just look at explicit operands if we removed VCC /
 478   // EXEC from SMRD dest reg classes.
 479   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
 480     MachineOperand &Op = MI.getOperand(i);
 481     if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 482       continue;
 483
 484     const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
 485     RegInterval Interval = getRegInterval(RC, Op);
 486     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 487
 488       if (Op.isDef()) {
 489         increaseCounters(Result, UsedRegs[j]);
 490         increaseCounters(Result, DefinedRegs[j]);
 491       }
 492
 493       if (Op.isUse())
 494         increaseCounters(Result, DefinedRegs[j]);
 495     }
 496   }
 497
 498   return Result;
 499 }
 500
 501 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
 502                                   MachineBasicBlock::iterator I) {
 503   if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
 504     return;
 505
 506   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
 507   if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
 508     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
 509     LastInstWritesM0 = false;
 510     return;
 511   }
 512
 513   // Set whether this instruction sets M0
 514   LastInstWritesM0 = false;
 515
 516   unsigned NumOperands = I->getNumOperands();
 517   for (unsigned i = 0; i < NumOperands; i++) {
 518     const MachineOperand &Op = I->getOperand(i);
 519
 520     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
 521       LastInstWritesM0 = true;
 522   }
 523 }
 524
 525 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 526 // around other non-memory instructions.
 527 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
 528   bool Changes = false;
 529
 530   ST = &MF.getSubtarget<SISubtarget>();
 531   TII = ST->getInstrInfo();
 532   TRI = &TII->getRegisterInfo();
 533   MRI = &MF.getRegInfo();
 534   IV = getIsaVersion(ST->getFeatureBits());
 535   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 536
 537   HardwareLimits.Named.VM = getVmcntBitMask(IV);
 538   HardwareLimits.Named.EXP = getExpcntBitMask(IV);
 539   HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV);
 540
 541   WaitedOn = ZeroCounts;
 542   DelayedWaitOn = ZeroCounts;
 543   LastIssued = ZeroCounts;
 544   LastOpcodeType = OTHER;
 545   LastInstWritesM0 = false;
 546   IsFlatOutstanding = false;
 547   ReturnsVoid = MFI->returnsVoid();
 548
 549   memset(&UsedRegs, 0, sizeof(UsedRegs));
 550   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 551
 552   SmallVector<MachineInstr *, 4> RemoveMI;
 553   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
 554
 555   bool HaveScalarStores = false;
 556
 557   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 558        BI != BE; ++BI) {
 559
 560     MachineBasicBlock &MBB = *BI;
 561
 562     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 563          I != E; ++I) {
 564
 565       if (!HaveScalarStores && TII->isScalarStore(*I))
 566         HaveScalarStores = true;
 567
 568       if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
 569         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
 570         // vccz bit, so when we detect that an instruction may read from a
 571         // corrupt vccz bit, we need to:
 572         // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
 573         //    complete.
 574         // 2. Restore the correct value of vccz by writing the current value
 575         //    of vcc back to vcc.
 576
 577         if (TII->isSMRD(I->getOpcode())) {
 578           VCCZCorrupt = true;
 579         } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
 580           // FIXME: We only care about SMRD instructions here, not LDS or GDS.
 581           // Whenever we store a value in vcc, the correct value of vccz is
 582           // restored.
 583           VCCZCorrupt = false;
 584         }
 585
 586         // Check if we need to apply the bug work-around
 587         if (VCCZCorrupt && readsVCCZ(*I)) {
 588           DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
 589
 590           // Wait on everything, not just LGKM.  vccz reads usually come from
 591           // terminators, and we always wait on everything at the end of the
 592           // block, so if we only wait on LGKM here, we might end up with
 593           // another s_waitcnt inserted right after this if there are non-LGKM
 594           // instructions still outstanding.
 595           insertWait(MBB, I, LastIssued);
 596
 597           // Restore the vccz bit.  Any time a value is written to vcc, the vcc
 598           // bit is updated, so we can restore the bit by reading the value of
 599           // vcc and then writing it back to the register.
 600           BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
 601                   AMDGPU::VCC)
 602             .addReg(AMDGPU::VCC);
 603         }
 604       }
 605
 606       // Record pre-existing, explicitly requested waits
 607       if (I->getOpcode() == AMDGPU::S_WAITCNT) {
 608         handleExistingWait(*I);
 609         RemoveMI.push_back(&*I);
 610         continue;
 611       }
 612
 613       Counters Required;
 614
 615       // Wait for everything before a barrier.
 616       //
 617       // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
 618       // but we also want to wait for any other outstanding transfers before
 619       // signalling other hardware blocks
 620       if ((I->getOpcode() == AMDGPU::S_BARRIER &&
 621                ST->needWaitcntBeforeBarrier()) ||
 622            I->getOpcode() == AMDGPU::S_SENDMSG ||
 623            I->getOpcode() == AMDGPU::S_SENDMSGHALT)
 624         Required = LastIssued;
 625       else
 626         Required = handleOperands(*I);
 627
 628       Counters Increment = getHwCounts(*I);
 629
 630       if (countersNonZero(Required) || countersNonZero(Increment))
 631         increaseCounters(Required, DelayedWaitOn);
 632
 633       Changes |= insertWait(MBB, I, Required);
 634
 635       pushInstruction(MBB, I, Increment);
 636       handleSendMsg(MBB, I);
 637
 638       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
 639           I->getOpcode() == AMDGPU::SI_RETURN)
 640         EndPgmBlocks.push_back(&MBB);
 641     }
 642
 643     // Wait for everything at the end of the MBB
 644     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
 645   }
 646
 647   if (HaveScalarStores) {
 648     // If scalar writes are used, the cache must be flushed or else the next
 649     // wave to reuse the same scratch memory can be clobbered.
 650     //
 651     // Insert s_dcache_wb at wave termination points if there were any scalar
 652     // stores, and only if the cache hasn't already been flushed. This could be
 653     // improved by looking across blocks for flushes in postdominating blocks
 654     // from the stores but an explicitly requested flush is probably very rare.
 655     for (MachineBasicBlock *MBB : EndPgmBlocks) {
 656       bool SeenDCacheWB = false;
 657
 658       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
 659            I != E; ++I) {
 660
 661         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
 662           SeenDCacheWB = true;
 663         else if (TII->isScalarStore(*I))
 664           SeenDCacheWB = false;
 665
 666         // FIXME: It would be better to insert this before a waitcnt if any.
 667         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
 668              I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
 669           Changes = true;
 670           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
 671         }
 672       }
 673     }
 674   }
 675
 676   for (MachineInstr *I : RemoveMI)
 677     I->eraseFromParent();
 678
 679   return Changes;
 680 }