//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

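// The generated *.inc file supplies the TableGen'erated feature tables and
// the ParseSubtargetFeatures() implementation used below.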
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global was specified explicitly, turn on FlatForGlobal
  // for all OSes on VI and newer hardware to avoid assertion failures due to
  // missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
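  // Scale the available LDS by the fraction of the CU's waves that the
  // requested occupancy represents. Illustrative numbers: 65536 bytes of LDS,
  // 10 waves max, 4 work groups per CU, and NWaves = 5 give
  // 65536 * 10 / 4 / 5 = 32768 bytes per work group.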
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
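  // Clamp to the subtarget's wave limit; at least one wave per EU is always
  // possible.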
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
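  // Graphics shaders have no dispatch-size guarantee, so only assume a single
  // wavefront by default.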
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

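  // For example, a workitem-ID query in a kernel with a known group size of 64
  // gets !range !{i32 0, i32 64}, while the matching size query gets
  // !range !{i32 64, i32 65} (illustrative values).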
  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo());

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
  unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
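  // Pad the explicit arguments up to the implicit block's alignment, then
  // append the implicit block. Illustrative numbers: 36 explicit bytes and a
  // 56-byte implicit block at 8-byte alignment give alignTo(36, 8) + 56 = 96.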
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

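// Occupancy (waves per EU) as limited by SGPR usage. The thresholds below
// follow the hardware's SGPR allocation granularity; e.g. on VI a wave using
// 90 SGPRs limits its EU to 8 waves.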
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

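// Occupancy as limited by VGPR usage, out of the 256-register file per SIMD;
// e.g. a wave using 40 VGPRs allows 256 / 40 ≈ 6 waves per EU.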
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).

  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

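  // Some VI parts have a hardware bug requiring a fixed SGPR allocation size;
  // override whatever was computed above with that fixed value.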
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

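        // Mirror SU's predecessors onto SUa and SUa's successors onto SU with
        // artificial edges, so no other instruction can be scheduled between
        // the two memory operations.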
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // end anonymous namespace

void SISubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}