1 //===-- SIInsertWaits.cpp - Insert Waits for Memory Reads and Writes -----===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief Insert wait instructions for memory reads and writes.
13 /// Memory reads and writes are issued asynchronously, so we need to insert
14 /// S_WAITCNT instructions when we want to access any of their results or
15 /// overwrite any register that's used asynchronously.
17 //===----------------------------------------------------------------------===//
20 #include "AMDGPUSubtarget.h"
21 #include "SIDefines.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineFunctionPass.h"
31 #include "llvm/CodeGen/MachineInstr.h"
32 #include "llvm/CodeGen/MachineInstrBuilder.h"
33 #include "llvm/CodeGen/MachineOperand.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/IR/DebugLoc.h"
36 #include "llvm/Pass.h"
37 #include "llvm/Support/Debug.h"
38 #include "llvm/Support/raw_ostream.h"
39 #include "llvm/Target/TargetRegisterInfo.h"
47 #define DEBUG_TYPE "si-insert-waits"
53 /// \brief One variable for each of the hardware counters
69 typedef Counters RegCounters[512];
70 typedef std::pair<unsigned, unsigned> RegInterval;
/// \brief Machine pass that inserts S_WAITCNT instructions so that results of
/// asynchronous memory operations are available before they are read or their
/// destination registers are overwritten.
/// NOTE(review): the embedded numerals at line starts are extraction
/// artifacts; several members visible in use elsewhere in this file (e.g. the
/// Counters union, the InstType enum, WaitedOn, LastIssued, UsedRegs,
/// ReturnsVoid, static char ID) are missing from this view.
72 class SIInsertWaits : public MachineFunctionPass {
74 const SISubtarget *ST = nullptr;
75 const SIInstrInfo *TII = nullptr;
76 const SIRegisterInfo *TRI = nullptr;
77 const MachineRegisterInfo *MRI;
/// \brief ISA version, used to encode/decode S_WAITCNT immediates.
78 AMDGPU::IsaInfo::IsaVersion ISA;
80 /// \brief Constant zero value
81 static const Counters ZeroCounts;
83 /// \brief Hardware limits
84 Counters HardwareLimits;
86 /// \brief Counter values we have already waited on.
89 /// \brief Counter values that we must wait on before the next counter
91 Counters DelayedWaitOn;
93 /// \brief Counter values for last instruction issued.
96 /// \brief Registers used by async instructions.
99 /// \brief Registers defined by async instructions.
100 RegCounters DefinedRegs;
102 /// \brief Different export instruction types seen since last wait.
103 unsigned ExpInstrTypesSeen = 0;
105 /// \brief Type of the last opcode.
106 InstType LastOpcodeType;
/// \brief Whether the last instruction seen wrote M0; used to decide whether
/// an S_NOP must precede S_SENDMSG (see handleSendMsg).
108 bool LastInstWritesM0;
110 /// Whether or not we have flat operations outstanding.
111 bool IsFlatOutstanding;
113 /// \brief Whether the machine function returns void
116 /// Whether the VCCZ bit is possibly corrupt
117 bool VCCZCorrupt = false;
119 /// \brief Get increment/decrement amount for this instruction.
120 Counters getHwCounts(MachineInstr &MI);
122 /// \brief Is operand relevant for async execution?
123 bool isOpRelevant(MachineOperand &Op);
125 /// \brief Get register interval an operand affects.
126 RegInterval getRegInterval(const TargetRegisterClass *RC,
127 const MachineOperand &Reg) const;
129 /// \brief Handle an instruction's async components.
130 void pushInstruction(MachineBasicBlock &MBB,
131 MachineBasicBlock::iterator I,
132 const Counters& Increment);
134 /// \brief Insert the actual wait instruction
135 bool insertWait(MachineBasicBlock &MBB,
136 MachineBasicBlock::iterator I,
137 const Counters &Counts);
139 /// \brief Handle existing wait instructions (from intrinsics)
140 void handleExistingWait(MachineBasicBlock::iterator I);
142 /// \brief Do we need def2def checks?
143 bool unorderedDefines(MachineInstr &MI);
145 /// \brief Resolve all operand dependencies to counter requirements
146 Counters handleOperands(MachineInstr &MI);
148 /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
149 void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
151 /// Return true if there are LGKM instructions that haven't been waited on
153 bool hasOutstandingLGKM() const;
158 SIInsertWaits() : MachineFunctionPass(ID) {}
160 bool runOnMachineFunction(MachineFunction &MF) override;
162 StringRef getPassName() const override {
163 return "SI insert wait instructions";
// This pass only inserts/removes instructions within blocks; the CFG shape
// is untouched.
166 void getAnalysisUsage(AnalysisUsage &AU) const override {
167 AU.setPreservesCFG();
168 MachineFunctionPass::getAnalysisUsage(AU);
172 } // end anonymous namespace
// Register the pass with the LLVM pass infrastructure under DEBUG_TYPE
// ("si-insert-waits"); it has no analysis dependencies.
174 INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
175 "SI Insert Waits", false, false)
176 INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
177 "SI Insert Waits", false, false)
// Pass identifier; its address — not its value — identifies the pass.
179 char SIInsertWaits::ID = 0;
// Exported handle so the target pass pipeline can reference this pass.
181 char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
183 FunctionPass *llvm::createSIInsertWaitsPass() {
184 return new SIInsertWaits();
// Out-of-line definition of the all-zero counter triple (VM, EXP, LGKM).
187 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
189 static bool readsVCCZ(const MachineInstr &MI) {
190 unsigned Opc = MI.getOpcode();
191 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
192 !MI.getOperand(1).isUndef();
195 bool SIInsertWaits::hasOutstandingLGKM() const {
196 return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
199 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
200 uint64_t TSFlags = MI.getDesc().TSFlags;
201 Counters Result = { { 0, 0, 0 } };
203 Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
205 // Only consider stores or EXP for EXP_CNT
206 Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
208 // LGKM may uses larger values
209 if (TSFlags & SIInstrFlags::LGKM_CNT) {
211 if (TII->isSMRD(MI)) {
213 if (MI.getNumOperands() != 0) {
214 assert(MI.getOperand(0).isReg() &&
215 "First LGKM operand must be a register!");
217 // XXX - What if this is a write into a super register?
218 const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
219 unsigned Size = TRI->getRegSizeInBits(*RC);
220 Result.Named.LGKM = Size > 32 ? 2 : 1;
222 // s_dcache_inv etc. do not have a a destination register. Assume we
223 // want a wait on these.
224 // XXX - What is the right value?
225 Result.Named.LGKM = 1;
229 Result.Named.LGKM = 1;
233 Result.Named.LGKM = 0;
239 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
240 // Constants are always irrelevant
241 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
244 // Defines are always relevant
248 // For exports all registers are relevant.
249 // TODO: Skip undef/disabled registers.
250 MachineInstr &MI = *Op.getParent();
254 // For stores the stored value is also relevant
255 if (!MI.getDesc().mayStore())
258 // Check if this operand is the value being stored.
259 // Special case for DS/FLAT instructions, since the address
260 // operand comes before the value operand and it may have
261 // multiple data operands.
264 MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
265 if (Data0 && Op.isIdenticalTo(*Data0))
268 MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
269 return Data1 && Op.isIdenticalTo(*Data1);
272 if (TII->isFLAT(MI)) {
273 MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
274 if (Data && Op.isIdenticalTo(*Data))
278 // NOTE: This assumes that the value operand is before the
279 // address operand, and that there is only one value operand.
280 for (MachineInstr::mop_iterator I = MI.operands_begin(),
281 E = MI.operands_end(); I != E; ++I) {
283 if (I->isReg() && I->isUse())
284 return Op.isIdenticalTo(*I);
290 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
291 const MachineOperand &Reg) const {
292 unsigned Size = TRI->getRegSizeInBits(*RC);
296 Result.first = TRI->getEncodingValue(Reg.getReg());
297 Result.second = Result.first + Size / 32;
302 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
303 MachineBasicBlock::iterator I,
304 const Counters &Increment) {
305 // Get the hardware counter increments and sum them up
306 Counters Limit = ZeroCounts;
309 if (TII->mayAccessFlatAddressSpace(*I))
310 IsFlatOutstanding = true;
312 for (unsigned i = 0; i < 3; ++i) {
313 LastIssued.Array[i] += Increment.Array[i];
314 if (Increment.Array[i])
315 Limit.Array[i] = LastIssued.Array[i];
316 Sum += Increment.Array[i];
319 // If we don't increase anything then that's it
321 LastOpcodeType = OTHER;
325 if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
326 // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
327 // or SMEM clause, respectively.
329 // The temporary workaround is to break the clauses with S_NOP.
331 // The proper solution would be to allocate registers such that all source
332 // and destination registers don't overlap, e.g. this is illegal:
335 if (LastOpcodeType == VMEM && Increment.Named.VM) {
336 // Insert a NOP to break the clause.
337 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
339 LastInstWritesM0 = false;
343 LastOpcodeType = SMEM;
344 else if (Increment.Named.VM)
345 LastOpcodeType = VMEM;
348 // Remember which export instructions we have seen
349 if (Increment.Named.EXP) {
350 ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
353 for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
354 MachineOperand &Op = I->getOperand(i);
355 if (!isOpRelevant(Op))
358 const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
359 RegInterval Interval = getRegInterval(RC, Op);
360 for (unsigned j = Interval.first; j < Interval.second; ++j) {
362 // Remember which registers we define
364 DefinedRegs[j] = Limit;
366 // and which one we are using
373 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
374 MachineBasicBlock::iterator I,
375 const Counters &Required) {
376 // End of program? No need to wait on anything
377 // A function not returning void needs to wait, because other bytecode will
378 // be appended after it and we don't know what it will be.
379 if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
382 // Figure out if the async instructions execute in order
385 // VM_CNT is always ordered except when there are flat instructions, which
386 // can return out of order.
387 Ordered[0] = !IsFlatOutstanding;
389 // EXP_CNT is unordered if we have both EXP & VM-writes
390 Ordered[1] = ExpInstrTypesSeen == 3;
392 // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
395 // The values we are going to put into the S_WAITCNT instruction
396 Counters Counts = HardwareLimits;
398 // Do we really need to wait?
399 bool NeedWait = false;
401 for (unsigned i = 0; i < 3; ++i) {
402 if (Required.Array[i] <= WaitedOn.Array[i])
408 unsigned Value = LastIssued.Array[i] - Required.Array[i];
410 // Adjust the value to the real hardware possibilities.
411 Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
416 // Remember on what we have waited on.
417 WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
423 // Reset EXP_CNT instruction types
424 if (Counts.Named.EXP == 0)
425 ExpInstrTypesSeen = 0;
427 // Build the wait instruction
428 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
429 .addImm(AMDGPU::encodeWaitcnt(ISA,
434 LastOpcodeType = OTHER;
435 LastInstWritesM0 = false;
436 IsFlatOutstanding = false;
440 /// \brief helper function for handleOperands
441 static void increaseCounters(Counters &Dst, const Counters &Src) {
442 for (unsigned i = 0; i < 3; ++i)
443 Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
446 /// \brief check whether any of the counters is non-zero
447 static bool countersNonZero(const Counters &Counter) {
448 for (unsigned i = 0; i < 3; ++i)
449 if (Counter.Array[i])
454 void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
455 assert(I->getOpcode() == AMDGPU::S_WAITCNT);
457 unsigned Imm = I->getOperand(0).getImm();
458 Counters Counts, WaitOn;
460 Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
461 Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
462 Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
464 for (unsigned i = 0; i < 3; ++i) {
465 if (Counts.Array[i] <= LastIssued.Array[i])
466 WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
471 increaseCounters(DelayedWaitOn, WaitOn);
474 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
475 Counters Result = ZeroCounts;
477 // For each register affected by this instruction increase the result
480 // TODO: We could probably just look at explicit operands if we removed VCC /
481 // EXEC from SMRD dest reg classes.
482 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
483 MachineOperand &Op = MI.getOperand(i);
484 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
487 const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
488 RegInterval Interval = getRegInterval(RC, Op);
489 for (unsigned j = Interval.first; j < Interval.second; ++j) {
491 increaseCounters(Result, UsedRegs[j]);
492 increaseCounters(Result, DefinedRegs[j]);
496 increaseCounters(Result, DefinedRegs[j]);
503 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
504 MachineBasicBlock::iterator I) {
505 if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
508 // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
509 if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
510 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
511 LastInstWritesM0 = false;
515 // Set whether this instruction sets M0
516 LastInstWritesM0 = false;
518 unsigned NumOperands = I->getNumOperands();
519 for (unsigned i = 0; i < NumOperands; i++) {
520 const MachineOperand &Op = I->getOperand(i);
522 if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
523 LastInstWritesM0 = true;
527 /// Return true if \p MBB has one successor immediately following, and is its
529 static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
530 if (MBB.succ_size() != 1)
533 const MachineBasicBlock *Succ = *MBB.succ_begin();
534 return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
537 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
538 // around other non-memory instructions.
539 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
540 bool Changes = false;
542 ST = &MF.getSubtarget<SISubtarget>();
543 TII = ST->getInstrInfo();
544 TRI = &TII->getRegisterInfo();
545 MRI = &MF.getRegInfo();
546 ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
547 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
549 HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
550 HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
551 HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
553 WaitedOn = ZeroCounts;
554 DelayedWaitOn = ZeroCounts;
555 LastIssued = ZeroCounts;
556 LastOpcodeType = OTHER;
557 LastInstWritesM0 = false;
558 IsFlatOutstanding = false;
559 ReturnsVoid = MFI->returnsVoid();
561 memset(&UsedRegs, 0, sizeof(UsedRegs));
562 memset(&DefinedRegs, 0, sizeof(DefinedRegs));
564 SmallVector<MachineInstr *, 4> RemoveMI;
565 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
567 bool HaveScalarStores = false;
569 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
572 MachineBasicBlock &MBB = *BI;
574 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
577 if (!HaveScalarStores && TII->isScalarStore(*I))
578 HaveScalarStores = true;
580 if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
581 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
582 // vccz bit, so when we detect that an instruction may read from a
583 // corrupt vccz bit, we need to:
584 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
586 // 2. Restore the correct value of vccz by writing the current value
587 // of vcc back to vcc.
589 if (TII->isSMRD(I->getOpcode())) {
591 } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
592 // FIXME: We only care about SMRD instructions here, not LDS or GDS.
593 // Whenever we store a value in vcc, the correct value of vccz is
598 // Check if we need to apply the bug work-around
599 if (VCCZCorrupt && readsVCCZ(*I)) {
600 DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
602 // Wait on everything, not just LGKM. vccz reads usually come from
603 // terminators, and we always wait on everything at the end of the
604 // block, so if we only wait on LGKM here, we might end up with
605 // another s_waitcnt inserted right after this if there are non-LGKM
606 // instructions still outstanding.
607 insertWait(MBB, I, LastIssued);
609 // Restore the vccz bit. Any time a value is written to vcc, the vcc
610 // bit is updated, so we can restore the bit by reading the value of
611 // vcc and then writing it back to the register.
612 BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
614 .addReg(AMDGPU::VCC);
618 // Record pre-existing, explicitly requested waits
619 if (I->getOpcode() == AMDGPU::S_WAITCNT) {
620 handleExistingWait(*I);
621 RemoveMI.push_back(&*I);
627 // Wait for everything before a barrier.
629 // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
630 // but we also want to wait for any other outstanding transfers before
631 // signalling other hardware blocks
632 if ((I->getOpcode() == AMDGPU::S_BARRIER &&
633 !ST->hasAutoWaitcntBeforeBarrier()) ||
634 I->getOpcode() == AMDGPU::S_SENDMSG ||
635 I->getOpcode() == AMDGPU::S_SENDMSGHALT)
636 Required = LastIssued;
638 Required = handleOperands(*I);
640 Counters Increment = getHwCounts(*I);
642 if (countersNonZero(Required) || countersNonZero(Increment))
643 increaseCounters(Required, DelayedWaitOn);
645 Changes |= insertWait(MBB, I, Required);
647 pushInstruction(MBB, I, Increment);
648 handleSendMsg(MBB, I);
650 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
651 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
652 EndPgmBlocks.push_back(&MBB);
655 // Wait for everything at the end of the MBB. If there is only one
656 // successor, we can defer this until the uses there.
657 if (!hasTrivialSuccessor(MBB))
658 Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
661 if (HaveScalarStores) {
662 // If scalar writes are used, the cache must be flushed or else the next
663 // wave to reuse the same scratch memory can be clobbered.
665 // Insert s_dcache_wb at wave termination points if there were any scalar
666 // stores, and only if the cache hasn't already been flushed. This could be
667 // improved by looking across blocks for flushes in postdominating blocks
668 // from the stores but an explicitly requested flush is probably very rare.
669 for (MachineBasicBlock *MBB : EndPgmBlocks) {
670 bool SeenDCacheWB = false;
672 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
675 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
677 else if (TII->isScalarStore(*I))
678 SeenDCacheWB = false;
680 // FIXME: It would be better to insert this before a waitcnt if any.
681 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
682 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
684 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
690 for (MachineInstr *I : RemoveMI)
691 I->eraseFromParent();
693 if (!MFI->isEntryFunction()) {
694 // Wait for any outstanding memory operations that the input registers may
695 // depend on. We can't track them and it's better to to the wait after the
696 // costly call sequence.
698 // TODO: Could insert earlier and schedule more liberally with operations
699 // that only use caller preserved registers.
700 MachineBasicBlock &EntryBB = MF.front();
701 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))