contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

   1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// Implements the AMDGPU specific subclass of TargetSubtarget.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "AMDGPUSubtarget.h"
  16 #include "AMDGPU.h"
  17 #include "AMDGPUTargetMachine.h"
  18 #include "AMDGPUCallLowering.h"
  19 #include "AMDGPUInstructionSelector.h"
  20 #include "AMDGPULegalizerInfo.h"
  21 #include "AMDGPURegisterBankInfo.h"
  22 #include "SIMachineFunctionInfo.h"
  23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  24 #include "llvm/ADT/SmallString.h"
  25 #include "llvm/CodeGen/MachineScheduler.h"
  26 #include "llvm/MC/MCSubtargetInfo.h"
  27 #include "llvm/IR/MDBuilder.h"
  28 #include "llvm/CodeGen/TargetFrameLowering.h"
  29 #include <algorithm>
  30
  31 using namespace llvm;
  32
  33 #define DEBUG_TYPE "amdgpu-subtarget"
  34
  35 #define GET_SUBTARGETINFO_TARGET_DESC
  36 #define GET_SUBTARGETINFO_CTOR
  37 #define AMDGPUSubtarget GCNSubtarget
  38 #include "AMDGPUGenSubtargetInfo.inc"
  39 #define GET_SUBTARGETINFO_TARGET_DESC
  40 #define GET_SUBTARGETINFO_CTOR
  41 #undef AMDGPUSubtarget
  42 #include "R600GenSubtargetInfo.inc"
  43
// Destructor is defaulted here, out of line, rather than in the class body.
// NOTE(review): presumably so that the types owned by the subtarget's members
// only need to be complete in this translation unit -- confirm against the
// header.
GCNSubtarget::~GCNSubtarget() = default;
  45
  46 R600Subtarget &
  47 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
  48                                                StringRef GPU, StringRef FS) {
  49   SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  50   FullFS += FS;
  51   ParseSubtargetFeatures(GPU, FullFS);
  52
  53   // FIXME: I don't think think Evergreen has any useful support for
  54   // denormals, but should be checked. Should we issue a warning somewhere
  55   // if someone tries to enable these?
  56   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
  57     FP32Denormals = false;
  58   }
  59
  60   HasMulU24 = getGeneration() >= EVERGREEN;
  61   HasMulI24 = hasCaymanISA();
  62
  63   return *this;
  64 }
  65
  66 GCNSubtarget &
  67 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
  68                                                  StringRef GPU, StringRef FS) {
  69   // Determine default and user-specified characteristics
  70   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  71   // enabled, but some instructions do not respect them and they run at the
  72   // double precision rate, so don't enable by default.
  73   //
  74   // We want to be able to turn these off, but making this a subtarget feature
  75   // for SI has the unhelpful behavior that it unsets everything else if you
  76   // disable it.
  77
  78   SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
  79
  80   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
  81     FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
  82
  83   // FIXME: I don't think think Evergreen has any useful support for
  84   // denormals, but should be checked. Should we issue a warning somewhere
  85   // if someone tries to enable these?
  86   if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
  87     FullFS += "+fp64-fp16-denormals,";
  88   } else {
  89     FullFS += "-fp32-denormals,";
  90   }
  91
  92   FullFS += FS;
  93
  94   ParseSubtargetFeatures(GPU, FullFS);
  95
  96   // We don't support FP64 for EG/NI atm.
  97   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
  98
  99   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
 100   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
 101   // variants of MUBUF instructions.
 102   if (!hasAddr64() && !FS.contains("flat-for-global")) {
 103     FlatForGlobal = true;
 104   }
 105
 106   // Set defaults if needed.
 107   if (MaxPrivateElementSize == 0)
 108     MaxPrivateElementSize = 4;
 109
 110   if (LDSBankCount == 0)
 111     LDSBankCount = 32;
 112
 113   if (TT.getArch() == Triple::amdgcn) {
 114     if (LocalMemorySize == 0)
 115       LocalMemorySize = 32768;
 116
 117     // Do something sensible for unspecified target.
 118     if (!HasMovrel && !HasVGPRIndexMode)
 119       HasMovrel = true;
 120   }
 121
 122   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
 123
 124   return *this;
 125 }
 126
/// Construct the state shared by the AMDGPU subtargets.
///
/// \param TT           Target triple, stored in TargetTriple.
/// \param FeatureBits  Feature bit set, stored in SubtargetFeatureBits.
///
/// The remaining fields only receive start values here. HasMulI24, HasMulU24
/// and HasFminFmaxLegacy start out true, the other flags start out false, and
/// LocalMemorySize / WavefrontSize start at 0. The derived subtargets'
/// initializeSubtargetDependencies() in this file later overwrite several of
/// them (e.g. HasMulI24/HasMulU24 for R600, HasFminFmaxLegacy and
/// LocalMemorySize for GCN).
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
                                             const FeatureBitset &FeatureBits) :
  TargetTriple(TT),
  SubtargetFeatureBits(FeatureBits),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
 144
/// Construct a GCN subtarget for \p GPU with feature string \p FS.
///
/// All feature fields are given start values (false / 0, generation
/// SOUTHERN_ISLANDS, ISA version 0.0.0). The InstrInfo initializer then calls
/// initializeSubtargetDependencies(), which parses the feature string and
/// fills in the real values, and passes the resulting subtarget reference to
/// InstrInfo.
///
/// NOTE(review): members are initialized in declaration order, not in the
/// order written below; this relies on the feature fields being declared
/// ahead of InstrInfo in the header -- confirm there.
///
/// \param TT  Target triple.
/// \param GPU Processor name.
/// \param FS  User-supplied feature string.
/// \param TM  Owning target machine; passed on to TLInfo, the legalizer and
///            the instruction selector.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT, getFeatureBits()),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    // Parses the feature string as a side effect before InstrInfo is built.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  AS = AMDGPU::getAMDGPUAS(TT);
  // Create the call lowering, legalizer, register bank info and instruction
  // selector objects owned by this subtarget.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  // The selector takes the concrete AMDGPURegisterBankInfo, hence the
  // downcast of the object created on the previous line.
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
 221
 222 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
 223   const Function &F) const {
 224   if (NWaves == 1)
 225     return getLocalMemorySize();
 226   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
 227   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
 228   unsigned MaxWaves = getMaxWavesPerEU();
 229   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
 230 }
 231
 232 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
 233   const Function &F) const {
 234   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
 235   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
 236   unsigned MaxWaves = getMaxWavesPerEU();
 237   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
 238   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
 239   NumWaves = std::min(NumWaves, MaxWaves);
 240   NumWaves = std::max(NumWaves, 1u);
 241   return NumWaves;
 242 }
 243
 244 unsigned
 245 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
 246   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
 247   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
 248 }
 249
 250 std::pair<unsigned, unsigned>
 251 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
 252   switch (CC) {
 253   case CallingConv::AMDGPU_CS:
 254   case CallingConv::AMDGPU_KERNEL:
 255   case CallingConv::SPIR_KERNEL:
 256     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
 257   case CallingConv::AMDGPU_VS:
 258   case CallingConv::AMDGPU_LS:
 259   case CallingConv::AMDGPU_HS:
 260   case CallingConv::AMDGPU_ES:
 261   case CallingConv::AMDGPU_GS:
 262   case CallingConv::AMDGPU_PS:
 263     return std::make_pair(1, getWavefrontSize());
 264   default:
 265     return std::make_pair(1, 16 * getWavefrontSize());
 266   }
 267 }
 268
 269 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
 270   const Function &F) const {
 271   // FIXME: 1024 if function.
 272   // Default minimum/maximum flat work group sizes.
 273   std::pair<unsigned, unsigned> Default =
 274     getDefaultFlatWorkGroupSize(F.getCallingConv());
 275
 276   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
 277   // starts using "amdgpu-flat-work-group-size" attribute.
 278   Default.second = AMDGPU::getIntegerAttribute(
 279     F, "amdgpu-max-work-group-size", Default.second);
 280   Default.first = std::min(Default.first, Default.second);
 281
 282   // Requested minimum/maximum flat work group sizes.
 283   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
 284     F, "amdgpu-flat-work-group-size", Default);
 285
 286   // Make sure requested minimum is less than requested maximum.
 287   if (Requested.first > Requested.second)
 288     return Default;
 289
 290   // Make sure requested values do not violate subtarget's specifications.
 291   if (Requested.first < getMinFlatWorkGroupSize())
 292     return Default;
 293   if (Requested.second > getMaxFlatWorkGroupSize())
 294     return Default;
 295
 296   return Requested;
 297 }
 298
 299 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
 300   const Function &F) const {
 301   // Default minimum/maximum number of waves per execution unit.
 302   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
 303
 304   // Default/requested minimum/maximum flat work group sizes.
 305   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
 306
 307   // If minimum/maximum flat work group sizes were explicitly requested using
 308   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
 309   // number of waves per execution unit to values implied by requested
 310   // minimum/maximum flat work group sizes.
 311   unsigned MinImpliedByFlatWorkGroupSize =
 312     getMaxWavesPerEU(FlatWorkGroupSizes.second);
 313   bool RequestedFlatWorkGroupSize = false;
 314
 315   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
 316   // starts using "amdgpu-flat-work-group-size" attribute.
 317   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
 318       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
 319     Default.first = MinImpliedByFlatWorkGroupSize;
 320     RequestedFlatWorkGroupSize = true;
 321   }
 322
 323   // Requested minimum/maximum number of waves per execution unit.
 324   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
 325     F, "amdgpu-waves-per-eu", Default, true);
 326
 327   // Make sure requested minimum is less than requested maximum.
 328   if (Requested.second && Requested.first > Requested.second)
 329     return Default;
 330
 331   // Make sure requested values do not violate subtarget's specifications.
 332   if (Requested.first < getMinWavesPerEU() ||
 333       Requested.first > getMaxWavesPerEU())
 334     return Default;
 335   if (Requested.second > getMaxWavesPerEU())
 336     return Default;
 337
 338   // Make sure requested values are compatible with values implied by requested
 339   // minimum/maximum flat work group sizes.
 340   if (RequestedFlatWorkGroupSize &&
 341       Requested.first < MinImpliedByFlatWorkGroupSize)
 342     return Default;
 343
 344   return Requested;
 345 }
 346
 347 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
 348   Function *Kernel = I->getParent()->getParent();
 349   unsigned MinSize = 0;
 350   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
 351   bool IdQuery = false;
 352
 353   // If reqd_work_group_size is present it narrows value down.
 354   if (auto *CI = dyn_cast<CallInst>(I)) {
 355     const Function *F = CI->getCalledFunction();
 356     if (F) {
 357       unsigned Dim = UINT_MAX;
 358       switch (F->getIntrinsicID()) {
 359       case Intrinsic::amdgcn_workitem_id_x:
 360       case Intrinsic::r600_read_tidig_x:
 361         IdQuery = true;
 362         LLVM_FALLTHROUGH;
 363       case Intrinsic::r600_read_local_size_x:
 364         Dim = 0;
 365         break;
 366       case Intrinsic::amdgcn_workitem_id_y:
 367       case Intrinsic::r600_read_tidig_y:
 368         IdQuery = true;
 369         LLVM_FALLTHROUGH;
 370       case Intrinsic::r600_read_local_size_y:
 371         Dim = 1;
 372         break;
 373       case Intrinsic::amdgcn_workitem_id_z:
 374       case Intrinsic::r600_read_tidig_z:
 375         IdQuery = true;
 376         LLVM_FALLTHROUGH;
 377       case Intrinsic::r600_read_local_size_z:
 378         Dim = 2;
 379         break;
 380       default:
 381         break;
 382       }
 383       if (Dim <= 3) {
 384         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
 385           if (Node->getNumOperands() == 3)
 386             MinSize = MaxSize = mdconst::extract<ConstantInt>(
 387                                   Node->getOperand(Dim))->getZExtValue();
 388       }
 389     }
 390   }
 391
 392   if (!MaxSize)
 393     return false;
 394
 395   // Range metadata is [Lo, Hi). For ID query we need to pass max size
 396   // as Hi. For size query we need to pass Hi + 1.
 397   if (IdQuery)
 398     MinSize = 0;
 399   else
 400     ++MaxSize;
 401
 402   MDBuilder MDB(I->getContext());
 403   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
 404                                                   APInt(32, MaxSize));
 405   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 406   return true;
 407 }
 408
 409 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
 410                                                  unsigned &MaxAlign) const {
 411   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
 412          F.getCallingConv() == CallingConv::SPIR_KERNEL);
 413
 414   const DataLayout &DL = F.getParent()->getDataLayout();
 415   uint64_t ExplicitArgBytes = 0;
 416   MaxAlign = 1;
 417
 418   for (const Argument &Arg : F.args()) {
 419     Type *ArgTy = Arg.getType();
 420
 421     unsigned Align = DL.getABITypeAlignment(ArgTy);
 422     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
 423     ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
 424     MaxAlign = std::max(MaxAlign, Align);
 425   }
 426
 427   return ExplicitArgBytes;
 428 }
 429
 430 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
 431                                                 unsigned &MaxAlign) const {
 432   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
 433
 434   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
 435
 436   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
 437   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
 438   if (ImplicitBytes != 0) {
 439     unsigned Alignment = getAlignmentForImplicitArgPtr();
 440     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
 441   }
 442
 443   // Being able to dereference past the end is useful for emitting scalar loads.
 444   return alignTo(TotalSize, 4);
 445 }
 446
/// Construct an R600 subtarget for \p GPU with feature string \p FS.
///
/// Feature fields receive start values (false / 0, generation R600). The
/// TLInfo initializer calls initializeSubtargetDependencies(), which parses
/// the feature string and fills in the real values, and passes the resulting
/// subtarget reference to TLInfo.
///
/// NOTE(review): members are initialized in declaration order, not in the
/// order written below -- confirm in the header that the feature fields are
/// declared ahead of TLInfo.
///
/// \param TT  Target triple.
/// \param GPU Processor name; also selects the instruction itineraries.
/// \param FS  User-supplied feature string.
/// \param TM  Owning target machine, passed on to TLInfo.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT, getFeatureBits()),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  // Parses the feature string as a side effect before TLInfo is built.
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)),
  AS (AMDGPU::getAMDGPUAS(TT)) { }
 465
 466 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
 467                                       unsigned NumRegionInstrs) const {
 468   // Track register pressure so the scheduler can try to decrease
 469   // pressure once register usage is above the threshold defined by
 470   // SIRegisterInfo::getRegPressureSetLimit()
 471   Policy.ShouldTrackPressure = true;
 472
 473   // Enabling both top down and bottom up scheduling seems to give us less
 474   // register spills than just using one of these approaches on its own.
 475   Policy.OnlyTopDown = false;
 476   Policy.OnlyBottomUp = false;
 477
 478   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
 479   if (!enableSIScheduler())
 480     Policy.ShouldTrackLaneMasks = true;
 481 }
 482
 483 bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
 484   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 485 }
 486
 487 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
 488   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
 489     if (SGPRs <= 80)
 490       return 10;
 491     if (SGPRs <= 88)
 492       return 9;
 493     if (SGPRs <= 100)
 494       return 8;
 495     return 7;
 496   }
 497   if (SGPRs <= 48)
 498     return 10;
 499   if (SGPRs <= 56)
 500     return 9;
 501   if (SGPRs <= 64)
 502     return 8;
 503   if (SGPRs <= 72)
 504     return 7;
 505   if (SGPRs <= 80)
 506     return 6;
 507   return 5;
 508 }
 509
 510 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
 511   if (VGPRs <= 24)
 512     return 10;
 513   if (VGPRs <= 28)
 514     return 9;
 515   if (VGPRs <= 32)
 516     return 8;
 517   if (VGPRs <= 36)
 518     return 7;
 519   if (VGPRs <= 40)
 520     return 6;
 521   if (VGPRs <= 48)
 522     return 5;
 523   if (VGPRs <= 64)
 524     return 4;
 525   if (VGPRs <= 84)
 526     return 3;
 527   if (VGPRs <= 128)
 528     return 2;
 529   return 1;
 530 }
 531
 532 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
 533   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 534   if (MFI.hasFlatScratchInit()) {
 535     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
 536       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
 537     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
 538       return 4; // FLAT_SCRATCH, VCC (in that order).
 539   }
 540
 541   if (isXNACKEnabled())
 542     return 4; // XNACK, VCC (in that order).
 543   return 2; // VCC.
 544 }
 545
 546 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
 547   const Function &F = MF.getFunction();
 548   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 549
 550   // Compute maximum number of SGPRs function can use using default/requested
 551   // minimum number of waves per execution unit.
 552   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
 553   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
 554   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
 555
 556   // Check if maximum number of SGPRs was explicitly requested using
 557   // "amdgpu-num-sgpr" attribute.
 558   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
 559     unsigned Requested = AMDGPU::getIntegerAttribute(
 560       F, "amdgpu-num-sgpr", MaxNumSGPRs);
 561
 562     // Make sure requested value does not violate subtarget's specifications.
 563     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
 564       Requested = 0;
 565
 566     // If more SGPRs are required to support the input user/system SGPRs,
 567     // increase to accommodate them.
 568     //
 569     // FIXME: This really ends up using the requested number of SGPRs + number
 570     // of reserved special registers in total. Theoretically you could re-use
 571     // the last input registers for these special registers, but this would
 572     // require a lot of complexity to deal with the weird aliasing.
 573     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
 574     if (Requested && Requested < InputNumSGPRs)
 575       Requested = InputNumSGPRs;
 576
 577     // Make sure requested value is compatible with values implied by
 578     // default/requested minimum/maximum number of waves per execution unit.
 579     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
 580       Requested = 0;
 581     if (WavesPerEU.second &&
 582         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
 583       Requested = 0;
 584
 585     if (Requested)
 586       MaxNumSGPRs = Requested;
 587   }
 588
 589   if (hasSGPRInitBug())
 590     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
 591
 592   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
 593                   MaxAddressableNumSGPRs);
 594 }
 595
 596 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
 597   const Function &F = MF.getFunction();
 598   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 599
 600   // Compute maximum number of VGPRs function can use using default/requested
 601   // minimum number of waves per execution unit.
 602   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
 603   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
 604
 605   // Check if maximum number of VGPRs was explicitly requested using
 606   // "amdgpu-num-vgpr" attribute.
 607   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
 608     unsigned Requested = AMDGPU::getIntegerAttribute(
 609       F, "amdgpu-num-vgpr", MaxNumVGPRs);
 610
 611     // Make sure requested value is compatible with values implied by
 612     // default/requested minimum/maximum number of waves per execution unit.
 613     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
 614       Requested = 0;
 615     if (WavesPerEU.second &&
 616         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
 617       Requested = 0;
 618
 619     if (Requested)
 620       MaxNumVGPRs = Requested;
 621   }
 622
 623   return MaxNumVGPRs;
 624 }
 625
namespace {
/// DAG mutation that links adjacent memory operations of the same kind
/// (VMEM, FLAT, SMRD or DS) so they stay next to each other.
struct MemOpClusterMutation : ScheduleDAGMutation {
  // Instruction info used to classify memory instructions.
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  /// Walk the scheduling units in order and add dependencies between each
  /// pair of neighbouring memory operations of the same kind.
  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    // Previous memory operation, or null if the previous unit was not one.
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      // A unit that neither loads nor stores ends the current run.
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      // First memory operation of a run: remember it and move on.
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only link operations that are both of the same memory kind.
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        // Make SU depend on SUa through a barrier edge.
        SU.addPredBarrier(SUa);

        // Give SUa an artificial dependency on every other predecessor of SU.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Make every other successor of SUa artificially depend on SU,
        // unless SU is the DAG's exit unit.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      // The current unit becomes the "previous" one for the next iteration.
      SUa = &SU;
    }
  }
};
} // namespace
 676
 677 void GCNSubtarget::getPostRAMutations(
 678     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
 679   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
 680 }
 681
 682 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
 683   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
 684     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
 685   else
 686     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
 687 }
 688
 689 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
 690   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
 691     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
 692   else
 693     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
 694 }