//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"
GCNSubtarget::~GCNSubtarget() = default;
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,"
              "+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);
  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}
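// Note on feature-string ordering (illustrative, not from the original file):
// the user features in FS are appended after the defaults built into FullFS
// above, and later entries win, so a hypothetical invocation such as
//   llc -mtriple=amdgcn -mattr=-load-store-opt ...
// turns off only +load-store-opt while leaving the other defaults intact.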
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDotInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
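// Worked example (illustrative numbers only): with 65536 bytes of LDS, a
// maximum of 10 waves per EU, and a work group size allowing 4 work groups
// per CU, asking for 5 waves yields 65536 * 10 / 4 / 5 = 32768 bytes of LDS
// available per work group.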
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
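// Worked example (illustrative numbers only): with 65536 bytes of LDS, 10
// waves per EU, and 2 work groups per CU, the budget is 65536 * 10 / 2 =
// 327680 bytes; a kernel using 65536 bytes of LDS then gets 327680 / 65536 =
// 5 waves, clamped to the [1, 10] range.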
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
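// For a subtarget with a 64-lane wavefront this yields [128, 256] for compute
// kernels, [1, 64] for the graphics shader stages, and [1, 1024] for
// everything else.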
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: The default maximum should be 1024 for non-kernel functions.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
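// Frontends request an explicit range with a function attribute, e.g.:
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// A range that violates the subtarget's limits falls back to the default.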
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
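// Example: "amdgpu-waves-per-eu"="2,4" requests between 2 and 4 waves per
// execution unit; a request that conflicts with the flat work group size
// attribute (or the subtarget's limits) is ignored in favor of the default.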
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();

    unsigned Dim = UINT_MAX;
    switch (F ? F->getIntrinsicID() : Intrinsic::not_intrinsic) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::r600_read_tidig_x:
      IdQuery = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::r600_read_local_size_x:
      Dim = 0;
      break;
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::r600_read_tidig_y:
      IdQuery = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::r600_read_local_size_y:
      Dim = 1;
      break;
    case Intrinsic::amdgcn_workitem_id_z:
    case Intrinsic::r600_read_tidig_z:
      IdQuery = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::r600_read_local_size_z:
      Dim = 2;
      break;
    default:
      break;
    }
    if (Dim <= 3) {
      if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
        if (Node->getNumOperands() == 3)
          MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                Node->getOperand(Dim))->getZExtValue();
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
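// Example result (illustrative): in a kernel annotated with
// reqd_work_group_size metadata of {256, 1, 1}, an x-dimension workitem ID
// call is tagged with !range !{i32 0, i32 256}, i.e. IDs in [0, 256), while
// an x-dimension local-size query would be tagged with the range [256, 257).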
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}
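// Worked example (assuming the usual ABI alignments): for kernel(float,
// <2 x float>) the float occupies bytes [0, 4), the 8-byte-aligned vector is
// placed at [8, 16), so the explicit size is 16 and MaxAlign is 8.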
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClockHalved(false),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
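// Worked example (illustrative; the constants are subtarget-dependent): with
// 256 total VGPRs, an allocation granule of 4, and a kernel using 96 VGPRs,
// the rounded allocation is 96, giving min(max(256 / 96, 1), MaxWaves) = 2
// waves per EU.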
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
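// Example: "amdgpu-num-sgpr"="48" caps the kernel at 48 SGPRs, provided 48
// exceeds the reserved registers and respects the waves-per-EU constraints
// computed above.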
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
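// Example: "amdgpu-num-vgpr"="64" caps the kernel at 64 VGPRs, again subject
// to the waves-per-EU constraints computed above.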
namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
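// For instance, two back-to-back DS reads from adjacent LDS addresses stay
// adjacent after scheduling, so later passes (e.g. load/store optimization)
// still have the chance to combine them into a single ds_read2 instruction.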
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}