//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <limits>

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  // MFMA hazards can require up to 18 wait states, so widen the lookahead
  // window when AGPRs (and therefore MFMA instructions) are in use.
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return NoopHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return NoopHazard;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return NoopHazard;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return NoopHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return NoopHazard;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return NoopHazard;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return NoopHazard;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return NoopHazard;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return NoopHazard;

  if (MI->mayLoadOrStore() && checkMAILdStHazards(MI) > 0)
    return NoopHazard;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return NoopHazard;

  if (checkAnyInstHazards(MI) > 0)
    return NoopHazard;

  return NoHazard;
}

static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
      .addImm(0);
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode)
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0; i < WaitStates; ++i)
      insertNoopInBundle(CurrCycleInstr, TII);

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = std::max(0, checkAnyInstHazards(MI));

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (MI->mayLoadOrStore())
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr)
    return;

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill())
    return;

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scanning stops once \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;
    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI,
                        BitVector &BV, unsigned Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // load or store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
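
// Illustrative soft-clause break (register numbers are made up, not from the
// source): the second load writes an SGPR that the first load's address uses:
//
//   s_load_dwordx2 s[4:5], s[0:1], 0x0
//   s_load_dword   s0,     s[4:5], 0x0   ; defs s0, read by the first load
//
// With XNACK the first load may be replayed after s0 has been clobbered, so
// the recognizer reports a hazard and the clause is broken with a non-SMEM
// instruction.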

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                   IsBufferHazardDefFn,
                                                   SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
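
// Illustrative example (made-up registers): on SI, a VALU write to an SGPR
// followed by an SMRD read of that SGPR needs 4 wait states:
//
//   v_readfirstlane_b32 s0, v0
//   s_nop 3                        ; 4 wait states
//   s_load_dword s4, s[0:1], 0x0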

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
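
// Illustrative example (made-up registers): a VALU write to an SGPR used by a
// VMEM instruction needs 5 wait states:
//
//   v_readfirstlane_b32 s4, v2
//   s_nop 4                                  ; 5 wait states
//   buffer_load_dword v0, off, s[8:11], s4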

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}
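
// Illustrative example: a VALU write to vcc followed by v_div_fmas, which
// reads vcc implicitly, needs 4 wait states:
//
//   v_cmp_eq_u32 vcc, v0, v1
//   s_nop 3                          ; 4 wait states
//   v_div_fmas_f32 v2, v3, v4, v5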

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                                const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}
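
// Illustrative example (made-up registers): buffer_store_dwordx4 stores more
// than 8 bytes, so a VALU def of one of its data VGPRs in the very next
// instruction would clobber the store data:
//
//   buffer_store_dwordx4 v[0:3], off, s[8:11], 0
//   s_nop 0                                        ; 1 wait state
//   v_mov_b32 v1, 0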

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
  if (MI->isDebugInstr())
    return 0;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!ST.hasSMovFedHazard())
    return 0;

  // Check for any instruction reading an SGPR after a write from
  // s_mov_fed_b32.
  int MovFedWaitStates = 1;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Use : MI->uses()) {
    if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    auto IsHazardFn = [] (MachineInstr *MI) {
      return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
    };
    int WaitStatesNeededForUse =
        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
                                                 MovFedWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}
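
// Illustrative example: an SALU write to m0 followed by an instruction that
// reads m0 implicitly (s_movrel*, v_interp_*, s_sendmsg, ...) needs 1 wait
// state:
//
//   s_mov_b32 m0, s0
//   s_nop 0                ; 1 wait state
//   s_movrels_b32 s4, s5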

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}
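
// Illustrative fixed sequence (made-up registers): the inserted move targets
// src0 of the permlane so that a live VGPR is both written and read:
//
//   v_cmpx_le_f32 vcc, v0, v1
//   v_mov_b32 v2, v2                   ; inserted by this function
//   v_permlane16_b32 v3, v2, s0, s1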

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
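
// Illustrative example (made-up registers): a VMEM access still using an SGPR
// that the next SALU instruction overwrites; the inserted v_nop (or a
// zero-count s_waitcnt) breaks the WAR hazard:
//
//   buffer_load_dword v0, off, s[8:11], 0
//   v_nop                                   ; inserted by this function
//   s_mov_b32 s8, 0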

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READLANE_B32_gfx10:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at risk SMEM (breaking chain),
          // or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}
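
// Illustrative example (made-up registers): an in-flight SMEM load still reads
// an SGPR that a VALU instruction is about to overwrite; writing SGPR_NULL
// provides the required separation:
//
//   s_load_dword s4, s[0:1], 0x0       ; reads s[0:1]
//   s_mov_b32 null, 0                  ; inserted by this function
//   v_readfirstlane_b32 s0, v0         ; VALU write to s0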

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1; // LDS
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2; // VMEM
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}
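
// Illustrative example (made-up labels and registers): an LDS access and a
// VMEM access separated only by a branch require a vscnt flush:
//
//   ds_write_b32 v0, v1
//   s_cbranch_scc1 BB1
// BB1:
//   s_waitcnt_vscnt null, 0                  ; inserted by this function
//   buffer_store_dword v2, off, s[8:11], 0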

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAITCNT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
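
// Illustrative example (opcode spelling approximate): an FP atomic followed
// too closely by s_denorm_mode needs 3 wait states, or an intervening
// s_waitcnt/VALU instruction:
//
//   buffer_atomic_fmax v0, off, s[8:11], 0 glc
//   s_nop 2                                      ; 3 wait states
//   s_denorm_mode 0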

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
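
// Illustrative example (made-up registers): reading an accumulator too soon
// after a 32x32 MFMA writes it needs up to 18 wait states (s_nop N yields
// N + 1 wait states):
//
//   v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
//   s_nop 7
//   s_nop 7
//   s_nop 1                                          ; 18 wait states total
//   v_accvgpr_read_b32 v2, a0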

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}