contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp

//===- SIInsertWaits.cpp - Insert wait instructions -----------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Insert wait instructions for memory reads and writes.
  12 ///
  13 /// Memory reads and writes are issued asynchronously, so we need to insert
  14 /// S_WAITCNT instructions when we want to access any of their results or
  15 /// overwrite any register that's used asynchronously.
  16 //
  17 //===----------------------------------------------------------------------===//
  18
  19 #include "AMDGPU.h"
  20 #include "AMDGPUSubtarget.h"
  21 #include "SIDefines.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIMachineFunctionInfo.h"
  24 #include "SIRegisterInfo.h"
  25 #include "Utils/AMDGPUBaseInfo.h"
  26 #include "llvm/ADT/SmallVector.h"
  27 #include "llvm/ADT/StringRef.h"
  28 #include "llvm/CodeGen/MachineBasicBlock.h"
  29 #include "llvm/CodeGen/MachineFunction.h"
  30 #include "llvm/CodeGen/MachineFunctionPass.h"
  31 #include "llvm/CodeGen/MachineInstr.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineOperand.h"
  34 #include "llvm/CodeGen/MachineRegisterInfo.h"
  35 #include "llvm/IR/DebugLoc.h"
  36 #include "llvm/MC/MCInstrDesc.h"
  37 #include "llvm/Pass.h"
  38 #include "llvm/Support/Debug.h"
  39 #include "llvm/Support/raw_ostream.h"
  40 #include <algorithm>
  41 #include <cassert>
  42 #include <cstdint>
  43 #include <cstring>
  44 #include <utility>
  45
  46 #define DEBUG_TYPE "si-insert-waits"
  47
  48 using namespace llvm;
  49
  50 namespace {
  51
  52 /// \brief One variable for each of the hardware counters
  53 using Counters = union {
  54   struct {
  55     unsigned VM;
  56     unsigned EXP;
  57     unsigned LGKM;
  58   } Named;
  59   unsigned Array[3];
  60 };
  61
  62 using InstType = enum {
  63   OTHER,
  64   SMEM,
  65   VMEM
  66 };
  67
  68 using RegCounters =  Counters[512];
  69 using RegInterval = std::pair<unsigned, unsigned>;
  70
  71 class SIInsertWaits : public MachineFunctionPass {
  72 private:
  73   const SISubtarget *ST = nullptr;
  74   const SIInstrInfo *TII = nullptr;
  75   const SIRegisterInfo *TRI = nullptr;
  76   const MachineRegisterInfo *MRI;
  77   AMDGPU::IsaInfo::IsaVersion ISA;
  78
  79   /// \brief Constant zero value
  80   static const Counters ZeroCounts;
  81
  82   /// \brief Hardware limits
  83   Counters HardwareLimits;
  84
  85   /// \brief Counter values we have already waited on.
  86   Counters WaitedOn;
  87
  88   /// \brief Counter values that we must wait on before the next counter
  89   /// increase.
  90   Counters DelayedWaitOn;
  91
  92   /// \brief Counter values for last instruction issued.
  93   Counters LastIssued;
  94
  95   /// \brief Registers used by async instructions.
  96   RegCounters UsedRegs;
  97
  98   /// \brief Registers defined by async instructions.
  99   RegCounters DefinedRegs;
 100
 101   /// \brief Different export instruction types seen since last wait.
 102   unsigned ExpInstrTypesSeen = 0;
 103
 104   /// \brief Type of the last opcode.
 105   InstType LastOpcodeType;
 106
 107   bool LastInstWritesM0;
 108
 109   /// Whether or not we have flat operations outstanding.
 110   bool IsFlatOutstanding;
 111
 112   /// \brief Whether the machine function returns void
 113   bool ReturnsVoid;
 114
 115   /// Whether the VCCZ bit is possibly corrupt
 116   bool VCCZCorrupt = false;
 117
 118   /// \brief Get increment/decrement amount for this instruction.
 119   Counters getHwCounts(MachineInstr &MI);
 120
 121   /// \brief Is operand relevant for async execution?
 122   bool isOpRelevant(MachineOperand &Op);
 123
 124   /// \brief Get register interval an operand affects.
 125   RegInterval getRegInterval(const TargetRegisterClass *RC,
 126                              const MachineOperand &Reg) const;
 127
 128   /// \brief Handle instructions async components
 129   void pushInstruction(MachineBasicBlock &MBB,
 130                        MachineBasicBlock::iterator I,
 131                        const Counters& Increment);
 132
 133   /// \brief Insert the actual wait instruction
 134   bool insertWait(MachineBasicBlock &MBB,
 135                   MachineBasicBlock::iterator I,
 136                   const Counters &Counts);
 137
 138   /// \brief Handle existing wait instructions (from intrinsics)
 139   void handleExistingWait(MachineBasicBlock::iterator I);
 140
 141   /// \brief Do we need def2def checks?
 142   bool unorderedDefines(MachineInstr &MI);
 143
 144   /// \brief Resolve all operand dependencies to counter requirements
 145   Counters handleOperands(MachineInstr &MI);
 146
 147   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
 148   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 149
 150   /// Return true if there are LGKM instrucitons that haven't been waited on
 151   /// yet.
 152   bool hasOutstandingLGKM() const;
 153
 154 public:
 155   static char ID;
 156
 157   SIInsertWaits() : MachineFunctionPass(ID) {}
 158
 159   bool runOnMachineFunction(MachineFunction &MF) override;
 160
 161   StringRef getPassName() const override {
 162     return "SI insert wait instructions";
 163   }
 164
 165   void getAnalysisUsage(AnalysisUsage &AU) const override {
 166     AU.setPreservesCFG();
 167     MachineFunctionPass::getAnalysisUsage(AU);
 168   }
 169 };
 170
 171 } // end anonymous namespace
 172
 173 INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
 174                       "SI Insert Waits", false, false)
 175 INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
 176                     "SI Insert Waits", false, false)
 177
 178 char SIInsertWaits::ID = 0;
 179
 180 char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
 181
 182 FunctionPass *llvm::createSIInsertWaitsPass() {
 183   return new SIInsertWaits();
 184 }
 185
 186 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
 187
 188 static bool readsVCCZ(const MachineInstr &MI) {
 189   unsigned Opc = MI.getOpcode();
 190   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
 191          !MI.getOperand(1).isUndef();
 192 }
 193
 194 bool SIInsertWaits::hasOutstandingLGKM() const {
 195   return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
 196 }
 197
 198 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
 199   uint64_t TSFlags = MI.getDesc().TSFlags;
 200   Counters Result = { { 0, 0, 0 } };
 201
 202   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
 203
 204   // Only consider stores or EXP for EXP_CNT
 205   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
 206
 207   // LGKM may uses larger values
 208   if (TSFlags & SIInstrFlags::LGKM_CNT) {
 209
 210     if (TII->isSMRD(MI)) {
 211
 212       if (MI.getNumOperands() != 0) {
 213         assert(MI.getOperand(0).isReg() &&
 214                "First LGKM operand must be a register!");
 215
 216         // XXX - What if this is a write into a super register?
 217         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
 218         unsigned Size = TRI->getRegSizeInBits(*RC);
 219         Result.Named.LGKM = Size > 32 ? 2 : 1;
 220       } else {
 221         // s_dcache_inv etc. do not have a a destination register. Assume we
 222         // want a wait on these.
 223         // XXX - What is the right value?
 224         Result.Named.LGKM = 1;
 225       }
 226     } else {
 227       // DS
 228       Result.Named.LGKM = 1;
 229     }
 230
 231   } else {
 232     Result.Named.LGKM = 0;
 233   }
 234
 235   return Result;
 236 }
 237
 238 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
 239   // Constants are always irrelevant
 240   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 241     return false;
 242
 243   // Defines are always relevant
 244   if (Op.isDef())
 245     return true;
 246
 247   // For exports all registers are relevant.
 248   // TODO: Skip undef/disabled registers.
 249   MachineInstr &MI = *Op.getParent();
 250   if (TII->isEXP(MI))
 251     return true;
 252
 253   // For stores the stored value is also relevant
 254   if (!MI.getDesc().mayStore())
 255     return false;
 256
 257   // Check if this operand is the value being stored.
 258   // Special case for DS/FLAT instructions, since the address
 259   // operand comes before the value operand and it may have
 260   // multiple data operands.
 261
 262   if (TII->isDS(MI)) {
 263     MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
 264     if (Data0 && Op.isIdenticalTo(*Data0))
 265       return true;
 266
 267     MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
 268     return Data1 && Op.isIdenticalTo(*Data1);
 269   }
 270
 271   if (TII->isFLAT(MI)) {
 272     MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
 273     if (Data && Op.isIdenticalTo(*Data))
 274       return true;
 275   }
 276
 277   // NOTE: This assumes that the value operand is before the
 278   // address operand, and that there is only one value operand.
 279   for (MachineInstr::mop_iterator I = MI.operands_begin(),
 280        E = MI.operands_end(); I != E; ++I) {
 281
 282     if (I->isReg() && I->isUse())
 283       return Op.isIdenticalTo(*I);
 284   }
 285
 286   return false;
 287 }
 288
 289 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
 290                                           const MachineOperand &Reg) const {
 291   unsigned Size = TRI->getRegSizeInBits(*RC);
 292   assert(Size >= 32);
 293
 294   RegInterval Result;
 295   Result.first = TRI->getEncodingValue(Reg.getReg());
 296   Result.second = Result.first + Size / 32;
 297
 298   return Result;
 299 }
 300
 301 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
 302                                     MachineBasicBlock::iterator I,
 303                                     const Counters &Increment) {
 304   // Get the hardware counter increments and sum them up
 305   Counters Limit = ZeroCounts;
 306   unsigned Sum = 0;
 307
 308   if (TII->mayAccessFlatAddressSpace(*I))
 309     IsFlatOutstanding = true;
 310
 311   for (unsigned i = 0; i < 3; ++i) {
 312     LastIssued.Array[i] += Increment.Array[i];
 313     if (Increment.Array[i])
 314       Limit.Array[i] = LastIssued.Array[i];
 315     Sum += Increment.Array[i];
 316   }
 317
 318   // If we don't increase anything then that's it
 319   if (Sum == 0) {
 320     LastOpcodeType = OTHER;
 321     return;
 322   }
 323
 324   if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
 325     // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
 326     // or SMEM clause, respectively.
 327     //
 328     // The temporary workaround is to break the clauses with S_NOP.
 329     //
 330     // The proper solution would be to allocate registers such that all source
 331     // and destination registers don't overlap, e.g. this is illegal:
 332     //   r0 = load r2
 333     //   r2 = load r0
 334     if (LastOpcodeType == VMEM && Increment.Named.VM) {
 335       // Insert a NOP to break the clause.
 336       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
 337           .addImm(0);
 338       LastInstWritesM0 = false;
 339     }
 340
 341     if (TII->isSMRD(*I))
 342       LastOpcodeType = SMEM;
 343     else if (Increment.Named.VM)
 344       LastOpcodeType = VMEM;
 345   }
 346
 347   // Remember which export instructions we have seen
 348   if (Increment.Named.EXP) {
 349     ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
 350   }
 351
 352   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
 353     MachineOperand &Op = I->getOperand(i);
 354     if (!isOpRelevant(Op))
 355       continue;
 356
 357     const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
 358     RegInterval Interval = getRegInterval(RC, Op);
 359     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 360
 361       // Remember which registers we define
 362       if (Op.isDef())
 363         DefinedRegs[j] = Limit;
 364
 365       // and which one we are using
 366       if (Op.isUse())
 367         UsedRegs[j] = Limit;
 368     }
 369   }
 370 }
 371
 372 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
 373                                MachineBasicBlock::iterator I,
 374                                const Counters &Required) {
 375   // End of program? No need to wait on anything
 376   // A function not returning void needs to wait, because other bytecode will
 377   // be appended after it and we don't know what it will be.
 378   if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
 379     return false;
 380
 381   // Figure out if the async instructions execute in order
 382   bool Ordered[3];
 383
 384   // VM_CNT is always ordered except when there are flat instructions, which
 385   // can return out of order.
 386   Ordered[0] = !IsFlatOutstanding;
 387
 388   // EXP_CNT is unordered if we have both EXP & VM-writes
 389   Ordered[1] = ExpInstrTypesSeen == 3;
 390
 391   // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
 392   Ordered[2] = false;
 393
 394   // The values we are going to put into the S_WAITCNT instruction
 395   Counters Counts = HardwareLimits;
 396
 397   // Do we really need to wait?
 398   bool NeedWait = false;
 399
 400   for (unsigned i = 0; i < 3; ++i) {
 401     if (Required.Array[i] <= WaitedOn.Array[i])
 402       continue;
 403
 404     NeedWait = true;
 405
 406     if (Ordered[i]) {
 407       unsigned Value = LastIssued.Array[i] - Required.Array[i];
 408
 409       // Adjust the value to the real hardware possibilities.
 410       Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
 411     } else
 412       Counts.Array[i] = 0;
 413
 414     // Remember on what we have waited on.
 415     WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
 416   }
 417
 418   if (!NeedWait)
 419     return false;
 420
 421   // Reset EXP_CNT instruction types
 422   if (Counts.Named.EXP == 0)
 423     ExpInstrTypesSeen = 0;
 424
 425   // Build the wait instruction
 426   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
 427     .addImm(AMDGPU::encodeWaitcnt(ISA,
 428                                   Counts.Named.VM,
 429                                   Counts.Named.EXP,
 430                                   Counts.Named.LGKM));
 431
 432   LastOpcodeType = OTHER;
 433   LastInstWritesM0 = false;
 434   IsFlatOutstanding = false;
 435   return true;
 436 }
 437
 438 /// \brief helper function for handleOperands
 439 static void increaseCounters(Counters &Dst, const Counters &Src) {
 440   for (unsigned i = 0; i < 3; ++i)
 441     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
 442 }
 443
 444 /// \brief check whether any of the counters is non-zero
 445 static bool countersNonZero(const Counters &Counter) {
 446   for (unsigned i = 0; i < 3; ++i)
 447     if (Counter.Array[i])
 448       return true;
 449   return false;
 450 }
 451
 452 void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
 453   assert(I->getOpcode() == AMDGPU::S_WAITCNT);
 454
 455   unsigned Imm = I->getOperand(0).getImm();
 456   Counters Counts, WaitOn;
 457
 458   Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
 459   Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
 460   Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
 461
 462   for (unsigned i = 0; i < 3; ++i) {
 463     if (Counts.Array[i] <= LastIssued.Array[i])
 464       WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
 465     else
 466       WaitOn.Array[i] = 0;
 467   }
 468
 469   increaseCounters(DelayedWaitOn, WaitOn);
 470 }
 471
 472 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
 473   Counters Result = ZeroCounts;
 474
 475   // For each register affected by this instruction increase the result
 476   // sequence.
 477   //
 478   // TODO: We could probably just look at explicit operands if we removed VCC /
 479   // EXEC from SMRD dest reg classes.
 480   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
 481     MachineOperand &Op = MI.getOperand(i);
 482     if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 483       continue;
 484
 485     const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
 486     RegInterval Interval = getRegInterval(RC, Op);
 487     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 488       if (Op.isDef()) {
 489         increaseCounters(Result, UsedRegs[j]);
 490         increaseCounters(Result, DefinedRegs[j]);
 491       }
 492
 493       if (Op.isUse())
 494         increaseCounters(Result, DefinedRegs[j]);
 495     }
 496   }
 497
 498   return Result;
 499 }
 500
 501 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
 502                                   MachineBasicBlock::iterator I) {
 503   if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
 504     return;
 505
 506   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
 507   if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
 508     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
 509     LastInstWritesM0 = false;
 510     return;
 511   }
 512
 513   // Set whether this instruction sets M0
 514   LastInstWritesM0 = false;
 515
 516   unsigned NumOperands = I->getNumOperands();
 517   for (unsigned i = 0; i < NumOperands; i++) {
 518     const MachineOperand &Op = I->getOperand(i);
 519
 520     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
 521       LastInstWritesM0 = true;
 522   }
 523 }
 524
 525 /// Return true if \p MBB has one successor immediately following, and is its
 526 /// only predecessor
 527 static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
 528   if (MBB.succ_size() != 1)
 529     return false;
 530
 531   const MachineBasicBlock *Succ = *MBB.succ_begin();
 532   return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
 533 }
 534
 535 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 536 // around other non-memory instructions.
 537 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
 538   bool Changes = false;
 539
 540   ST = &MF.getSubtarget<SISubtarget>();
 541   TII = ST->getInstrInfo();
 542   TRI = &TII->getRegisterInfo();
 543   MRI = &MF.getRegInfo();
 544   ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
 545   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 546
 547   HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
 548   HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
 549   HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
 550
 551   WaitedOn = ZeroCounts;
 552   DelayedWaitOn = ZeroCounts;
 553   LastIssued = ZeroCounts;
 554   LastOpcodeType = OTHER;
 555   LastInstWritesM0 = false;
 556   IsFlatOutstanding = false;
 557   ReturnsVoid = MFI->returnsVoid();
 558
 559   memset(&UsedRegs, 0, sizeof(UsedRegs));
 560   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 561
 562   SmallVector<MachineInstr *, 4> RemoveMI;
 563   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
 564
 565   bool HaveScalarStores = false;
 566
 567   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 568        BI != BE; ++BI) {
 569     MachineBasicBlock &MBB = *BI;
 570
 571     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 572          I != E; ++I) {
 573       if (!HaveScalarStores && TII->isScalarStore(*I))
 574         HaveScalarStores = true;
 575
 576       if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
 577         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
 578         // vccz bit, so when we detect that an instruction may read from a
 579         // corrupt vccz bit, we need to:
 580         // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
 581         //    complete.
 582         // 2. Restore the correct value of vccz by writing the current value
 583         //    of vcc back to vcc.
 584
 585         if (TII->isSMRD(I->getOpcode())) {
 586           VCCZCorrupt = true;
 587         } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
 588           // FIXME: We only care about SMRD instructions here, not LDS or GDS.
 589           // Whenever we store a value in vcc, the correct value of vccz is
 590           // restored.
 591           VCCZCorrupt = false;
 592         }
 593
 594         // Check if we need to apply the bug work-around
 595         if (VCCZCorrupt && readsVCCZ(*I)) {
 596           DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
 597
 598           // Wait on everything, not just LGKM.  vccz reads usually come from
 599           // terminators, and we always wait on everything at the end of the
 600           // block, so if we only wait on LGKM here, we might end up with
 601           // another s_waitcnt inserted right after this if there are non-LGKM
 602           // instructions still outstanding.
 603           insertWait(MBB, I, LastIssued);
 604
 605           // Restore the vccz bit.  Any time a value is written to vcc, the vcc
 606           // bit is updated, so we can restore the bit by reading the value of
 607           // vcc and then writing it back to the register.
 608           BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
 609                   AMDGPU::VCC)
 610             .addReg(AMDGPU::VCC);
 611         }
 612       }
 613
 614       // Record pre-existing, explicitly requested waits
 615       if (I->getOpcode() == AMDGPU::S_WAITCNT) {
 616         handleExistingWait(*I);
 617         RemoveMI.push_back(&*I);
 618         continue;
 619       }
 620
 621       Counters Required;
 622
 623       // Wait for everything before a barrier.
 624       //
 625       // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
 626       // but we also want to wait for any other outstanding transfers before
 627       // signalling other hardware blocks
 628       if ((I->getOpcode() == AMDGPU::S_BARRIER &&
 629                !ST->hasAutoWaitcntBeforeBarrier()) ||
 630            I->getOpcode() == AMDGPU::S_SENDMSG ||
 631            I->getOpcode() == AMDGPU::S_SENDMSGHALT)
 632         Required = LastIssued;
 633       else
 634         Required = handleOperands(*I);
 635
 636       Counters Increment = getHwCounts(*I);
 637
 638       if (countersNonZero(Required) || countersNonZero(Increment))
 639         increaseCounters(Required, DelayedWaitOn);
 640
 641       Changes |= insertWait(MBB, I, Required);
 642
 643       pushInstruction(MBB, I, Increment);
 644       handleSendMsg(MBB, I);
 645
 646       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
 647           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
 648         EndPgmBlocks.push_back(&MBB);
 649     }
 650
 651     // Wait for everything at the end of the MBB. If there is only one
 652     // successor, we can defer this until the uses there.
 653     if (!hasTrivialSuccessor(MBB))
 654       Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
 655   }
 656
 657   if (HaveScalarStores) {
 658     // If scalar writes are used, the cache must be flushed or else the next
 659     // wave to reuse the same scratch memory can be clobbered.
 660     //
 661     // Insert s_dcache_wb at wave termination points if there were any scalar
 662     // stores, and only if the cache hasn't already been flushed. This could be
 663     // improved by looking across blocks for flushes in postdominating blocks
 664     // from the stores but an explicitly requested flush is probably very rare.
 665     for (MachineBasicBlock *MBB : EndPgmBlocks) {
 666       bool SeenDCacheWB = false;
 667
 668       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
 669            I != E; ++I) {
 670         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
 671           SeenDCacheWB = true;
 672         else if (TII->isScalarStore(*I))
 673           SeenDCacheWB = false;
 674
 675         // FIXME: It would be better to insert this before a waitcnt if any.
 676         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
 677              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
 678           Changes = true;
 679           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
 680         }
 681       }
 682     }
 683   }
 684
 685   for (MachineInstr *I : RemoveMI)
 686     I->eraseFromParent();
 687
 688   if (!MFI->isEntryFunction()) {
 689     // Wait for any outstanding memory operations that the input registers may
 690     // depend on. We can't track them and it's better to to the wait after the
 691     // costly call sequence.
 692
 693     // TODO: Could insert earlier and schedule more liberally with operations
 694     // that only use caller preserved registers.
 695     MachineBasicBlock &EntryBB = MF.front();
 696     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
 697       .addImm(0);
 698
 699     Changes = true;
 700   }
 701
 702   return Changes;
 703 }