1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
32 #define DEBUG_TYPE "amdgpu-subtarget"
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
43 static cl::opt<bool> DisablePowerSched(
44 "amdgpu-disable-power-sched",
45 cl::desc("Disable scheduling to minimize mAI power bursts"),
48 GCNSubtarget::~GCNSubtarget() = default;
51 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
52 StringRef GPU, StringRef FS) {
53 SmallString<256> FullFS("+promote-alloca,");
55 ParseSubtargetFeatures(GPU, FullFS);
// FIXME: I don't think Evergreen has any useful support for
// denormals, but this should be checked. Should we issue a warning somewhere
59 // if someone tries to enable these?
60 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
61 FP32Denormals = false;
64 HasMulU24 = getGeneration() >= EVERGREEN;
65 HasMulI24 = hasCaymanISA();
71 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
72 StringRef GPU, StringRef FS) {
73 // Determine default and user-specified characteristics
74 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
75 // enabled, but some instructions do not respect them and they run at the
76 // double precision rate, so don't enable by default.
78 // We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
//
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled.
85 // Assuming ECC is enabled is the conservative default.
86 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
88 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
89 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
// FIXME: I don't think Evergreen has any useful support for
// denormals, but this should be checked. Should we issue a warning somewhere
93 // if someone tries to enable these?
94 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
95 FullFS += "+fp64-fp16-denormals,";
97 FullFS += "-fp32-denormals,";
100 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
102 // Disable mutually exclusive bits.
103 if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
104 if (FS.find_lower("wavefrontsize16") == StringRef::npos)
105 FullFS += "-wavefrontsize16,";
106 if (FS.find_lower("wavefrontsize32") == StringRef::npos)
107 FullFS += "-wavefrontsize32,";
108 if (FS.find_lower("wavefrontsize64") == StringRef::npos)
109 FullFS += "-wavefrontsize64,";
114 ParseSubtargetFeatures(GPU, FullFS);
116 // We don't support FP64 for EG/NI atm.
117 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
119 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
120 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
121 // variants of MUBUF instructions.
122 if (!hasAddr64() && !FS.contains("flat-for-global")) {
123 FlatForGlobal = true;
126 // Set defaults if needed.
127 if (MaxPrivateElementSize == 0)
128 MaxPrivateElementSize = 4;
130 if (LDSBankCount == 0)
133 if (TT.getArch() == Triple::amdgcn) {
134 if (LocalMemorySize == 0)
135 LocalMemorySize = 32768;
137 // Do something sensible for unspecified target.
138 if (!HasMovrel && !HasVGPRIndexMode)
142 // Don't crash on invalid devices.
143 if (WavefrontSize == 0)
146 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
148 if (DoesNotSupportXNACK && EnableXNACK) {
149 ToggleFeature(AMDGPU::FeatureXNACK);
153 // ECC is on by default, but turn it off if the hardware doesn't support it
// anyway. This matters for the gfx9 targets with d16 loads, which don't
// support ECC.
156 if (DoesNotSupportSRAMECC && EnableSRAMECC) {
157 ToggleFeature(AMDGPU::FeatureSRAMECC);
158 EnableSRAMECC = false;
164 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
166 Has16BitInsts(false),
167 HasMadMixInsts(false),
168 FP32Denormals(false),
171 HasVOP3PInsts(false),
174 HasInv2PiInlineImm(false),
175 HasFminFmaxLegacy(true),
176 EnablePromoteAlloca(false),
177 HasTrigReducedRange(false),
182 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
183 const GCNTargetMachine &TM) :
184 AMDGPUGenSubtargetInfo(TT, GPU, FS),
187 Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
188 InstrItins(getInstrItineraryForCPU(GPU)),
190 MaxPrivateElementSize(0),
193 HalfRate64Ops(false),
195 FP64FP16Denormals(false),
196 FlatForGlobal(false),
197 AutoWaitcntBeforeBarrier(false),
199 UnalignedScratchAccess(false),
200 UnalignedBufferAccess(false),
202 HasApertureRegs(false),
204 DoesNotSupportXNACK(false),
208 EnableLoadStoreOpt(false),
209 EnableUnsafeDSOffsetFolding(false),
210 EnableSIScheduler(false),
212 EnablePRTStrictNull(false),
221 GFX7GFX8GFX9Insts(false),
223 HasSMemRealTime(false),
225 HasFmaMixInsts(false),
227 HasVGPRIndexMode(false),
228 HasScalarStores(false),
229 HasScalarAtomics(false),
231 HasSDWAScalar(false),
234 HasSDWAOutModsVOPC(false),
238 HasNSAEncoding(false),
247 HasPkFmacF16Inst(false),
248 HasAtomicFaddInsts(false),
249 EnableSRAMECC(false),
250 DoesNotSupportSRAMECC(false),
251 HasNoSdstCMPX(false),
253 HasRegisterBanking(false),
254 HasVOP3Literal(false),
255 HasNoDataDepHazard(false),
256 FlatAddressSpace(false),
257 FlatInstOffsets(false),
258 FlatGlobalInsts(false),
259 FlatScratchInsts(false),
260 ScalarFlatScratchInsts(false),
261 AddNoCarryInsts(false),
262 HasUnpackedD16VMem(false),
263 LDSMisalignedBug(false),
265 ScalarizeGlobal(false),
267 HasVcmpxPermlaneHazard(false),
268 HasVMEMtoScalarWriteHazard(false),
269 HasSMEMtoVectorWriteHazard(false),
270 HasInstFwdPrefetchBug(false),
271 HasVcmpxExecWARHazard(false),
272 HasLdsBranchVmemWARHazard(false),
273 HasNSAtoVMEMBug(false),
274 HasOffset3fBug(false),
275 HasFlatSegmentOffsetBug(false),
277 FeatureDisable(false),
278 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
280 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
281 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
282 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
283 RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
284 InstSelector.reset(new AMDGPUInstructionSelector(
285 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
288 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
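  // Prior to GFX10 only one SGPR or literal operand may be read over the
  // constant bus per VALU instruction; GFX10 relaxes the limit for most
  // opcodes, but the 64-bit shifts listed below keep the stricter limit.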
289 if (getGeneration() < GFX10)
293 case AMDGPU::V_LSHLREV_B64:
294 case AMDGPU::V_LSHLREV_B64_gfx10:
295 case AMDGPU::V_LSHL_B64:
296 case AMDGPU::V_LSHRREV_B64:
297 case AMDGPU::V_LSHRREV_B64_gfx10:
298 case AMDGPU::V_LSHR_B64:
299 case AMDGPU::V_ASHRREV_I64:
300 case AMDGPU::V_ASHRREV_I64_gfx10:
301 case AMDGPU::V_ASHR_I64:
308 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
309 const Function &F) const {
311 return getLocalMemorySize();
312 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
313 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
314 if (!WorkGroupsPerCu)
316 unsigned MaxWaves = getMaxWavesPerEU();
317 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
320 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
321 const Function &F) const {
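  // Illustrative example (hypothetical numbers): with 65536 bytes of LDS,
  // MaxWaves = 10 and 4 work groups per CU, Limit = 65536 * 10 / 4 = 163840;
  // a kernel using 8192 bytes of LDS gets 163840 / 8192 = 20 waves, clamped
  // to the [1, MaxWaves] range, i.e. 10.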
322 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
323 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
324 if (!WorkGroupsPerCu)
326 unsigned MaxWaves = getMaxWavesPerEU();
327 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
328 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
329 NumWaves = std::min(NumWaves, MaxWaves);
330 NumWaves = std::max(NumWaves, 1u);
335 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
336 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
337 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
340 std::pair<unsigned, unsigned>
341 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
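  // For a wave64 target this yields [128, 256] for compute/kernel calling
  // conventions, [1, 64] for the graphics shader stages, and [1, 1024]
  // otherwise.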
343 case CallingConv::AMDGPU_CS:
344 case CallingConv::AMDGPU_KERNEL:
345 case CallingConv::SPIR_KERNEL:
346 return std::make_pair(getWavefrontSize() * 2,
347 std::max(getWavefrontSize() * 4, 256u));
348 case CallingConv::AMDGPU_VS:
349 case CallingConv::AMDGPU_LS:
350 case CallingConv::AMDGPU_HS:
351 case CallingConv::AMDGPU_ES:
352 case CallingConv::AMDGPU_GS:
353 case CallingConv::AMDGPU_PS:
354 return std::make_pair(1, getWavefrontSize());
356 return std::make_pair(1, 16 * getWavefrontSize());
360 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
361 const Function &F) const {
362 // FIXME: 1024 if function.
363 // Default minimum/maximum flat work group sizes.
364 std::pair<unsigned, unsigned> Default =
365 getDefaultFlatWorkGroupSize(F.getCallingConv());
367 // Requested minimum/maximum flat work group sizes.
368 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
369 F, "amdgpu-flat-work-group-size", Default);
371 // Make sure requested minimum is less than requested maximum.
372 if (Requested.first > Requested.second)
375 // Make sure requested values do not violate subtarget's specifications.
376 if (Requested.first < getMinFlatWorkGroupSize())
378 if (Requested.second > getMaxFlatWorkGroupSize())
384 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
385 const Function &F) const {
386 // Default minimum/maximum number of waves per execution unit.
387 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
389 // Default/requested minimum/maximum flat work group sizes.
390 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
392 // If minimum/maximum flat work group sizes were explicitly requested using
393 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
394 // number of waves per execution unit to values implied by requested
395 // minimum/maximum flat work group sizes.
396 unsigned MinImpliedByFlatWorkGroupSize =
397 getMaxWavesPerEU(FlatWorkGroupSizes.second);
398 bool RequestedFlatWorkGroupSize = false;
400 if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
401 Default.first = MinImpliedByFlatWorkGroupSize;
402 RequestedFlatWorkGroupSize = true;
405 // Requested minimum/maximum number of waves per execution unit.
406 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
407 F, "amdgpu-waves-per-eu", Default, true);
409 // Make sure requested minimum is less than requested maximum.
410 if (Requested.second && Requested.first > Requested.second)
413 // Make sure requested values do not violate subtarget's specifications.
414 if (Requested.first < getMinWavesPerEU() ||
415 Requested.first > getMaxWavesPerEU())
417 if (Requested.second > getMaxWavesPerEU())
420 // Make sure requested values are compatible with values implied by requested
421 // minimum/maximum flat work group sizes.
422 if (RequestedFlatWorkGroupSize &&
423 Requested.first < MinImpliedByFlatWorkGroupSize)
429 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
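  // Attach !range metadata to calls of the workitem-id / local-size
  // intrinsics, bounded by the kernel's flat work group size and, when
  // present, narrowed by the reqd_work_group_size metadata handled below.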
430 Function *Kernel = I->getParent()->getParent();
431 unsigned MinSize = 0;
432 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
433 bool IdQuery = false;
// If reqd_work_group_size is present it narrows the value down.
436 if (auto *CI = dyn_cast<CallInst>(I)) {
437 const Function *F = CI->getCalledFunction();
439 unsigned Dim = UINT_MAX;
440 switch (F->getIntrinsicID()) {
441 case Intrinsic::amdgcn_workitem_id_x:
442 case Intrinsic::r600_read_tidig_x:
445 case Intrinsic::r600_read_local_size_x:
448 case Intrinsic::amdgcn_workitem_id_y:
449 case Intrinsic::r600_read_tidig_y:
452 case Intrinsic::r600_read_local_size_y:
455 case Intrinsic::amdgcn_workitem_id_z:
456 case Intrinsic::r600_read_tidig_z:
459 case Intrinsic::r600_read_local_size_z:
466 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
467 if (Node->getNumOperands() == 3)
468 MinSize = MaxSize = mdconst::extract<ConstantInt>(
469 Node->getOperand(Dim))->getZExtValue();
477 // Range metadata is [Lo, Hi). For ID query we need to pass max size
478 // as Hi. For size query we need to pass Hi + 1.
484 MDBuilder MDB(I->getContext());
485 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
487 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
491 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
492 unsigned &MaxAlign) const {
493 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
494 F.getCallingConv() == CallingConv::SPIR_KERNEL);
496 const DataLayout &DL = F.getParent()->getDataLayout();
497 uint64_t ExplicitArgBytes = 0;
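  // Accumulate the explicit arguments, aligning each to its ABI type
  // alignment. Illustrative example: an i32 (4 bytes, align 4) followed by a
  // <4 x i32> (16 bytes, align 16) yields alignTo(4, 16) + 16 = 32 bytes,
  // with MaxAlign = 16.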
500 for (const Argument &Arg : F.args()) {
501 Type *ArgTy = Arg.getType();
503 unsigned Align = DL.getABITypeAlignment(ArgTy);
504 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
505 ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
506 MaxAlign = std::max(MaxAlign, Align);
509 return ExplicitArgBytes;
512 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
513 unsigned &MaxAlign) const {
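  // The kernarg segment consists of the explicit arguments starting at the
  // target's explicit kernarg offset, optionally followed by the implicit
  // arguments at their own alignment; the total is padded to a multiple of 4.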
514 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
516 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
518 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
519 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
520 if (ImplicitBytes != 0) {
521 unsigned Alignment = getAlignmentForImplicitArgPtr();
522 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
525 // Being able to dereference past the end is useful for emitting scalar loads.
526 return alignTo(TotalSize, 4);
529 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
530 const TargetMachine &TM) :
531 R600GenSubtargetInfo(TT, GPU, FS),
534 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
538 HasVertexCache(false),
543 TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
544 InstrItins(getInstrItineraryForCPU(GPU)) { }
546 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
547 unsigned NumRegionInstrs) const {
548 // Track register pressure so the scheduler can try to decrease
549 // pressure once register usage is above the threshold defined by
550 // SIRegisterInfo::getRegPressureSetLimit()
551 Policy.ShouldTrackPressure = true;
// Enabling both top down and bottom up scheduling seems to give us fewer
554 // register spills than just using one of these approaches on its own.
555 Policy.OnlyTopDown = false;
556 Policy.OnlyBottomUp = false;
558 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
559 if (!enableSIScheduler())
560 Policy.ShouldTrackLaneMasks = true;
563 bool GCNSubtarget::hasMadF16() const {
564 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
567 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
568 if (getGeneration() >= AMDGPUSubtarget::GFX10)
571 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
593 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
615 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
616 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
617 if (getGeneration() >= AMDGPUSubtarget::GFX10)
618 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
620 if (MFI.hasFlatScratchInit()) {
621 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
622 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
623 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
624 return 4; // FLAT_SCRATCH, VCC (in that order).
627 if (isXNACKEnabled())
628 return 4; // XNACK, VCC (in that order).
632 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
633 const Function &F = MF.getFunction();
634 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
636 // Compute maximum number of SGPRs function can use using default/requested
637 // minimum number of waves per execution unit.
638 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
639 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
640 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
642 // Check if maximum number of SGPRs was explicitly requested using
643 // "amdgpu-num-sgpr" attribute.
644 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
645 unsigned Requested = AMDGPU::getIntegerAttribute(
646 F, "amdgpu-num-sgpr", MaxNumSGPRs);
648 // Make sure requested value does not violate subtarget's specifications.
649 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
652 // If more SGPRs are required to support the input user/system SGPRs,
653 // increase to accommodate them.
655 // FIXME: This really ends up using the requested number of SGPRs + number
656 // of reserved special registers in total. Theoretically you could re-use
657 // the last input registers for these special registers, but this would
658 // require a lot of complexity to deal with the weird aliasing.
659 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
660 if (Requested && Requested < InputNumSGPRs)
661 Requested = InputNumSGPRs;
663 // Make sure requested value is compatible with values implied by
664 // default/requested minimum/maximum number of waves per execution unit.
665 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
667 if (WavesPerEU.second &&
668 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
672 MaxNumSGPRs = Requested;
675 if (hasSGPRInitBug())
676 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
678 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
679 MaxAddressableNumSGPRs);
682 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
683 const Function &F = MF.getFunction();
684 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
686 // Compute maximum number of VGPRs function can use using default/requested
687 // minimum number of waves per execution unit.
688 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
689 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
691 // Check if maximum number of VGPRs was explicitly requested using
692 // "amdgpu-num-vgpr" attribute.
693 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
694 unsigned Requested = AMDGPU::getIntegerAttribute(
695 F, "amdgpu-num-vgpr", MaxNumVGPRs);
697 // Make sure requested value is compatible with values implied by
698 // default/requested minimum/maximum number of waves per execution unit.
699 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
701 if (WavesPerEU.second &&
702 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
706 MaxNumVGPRs = Requested;
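// Scheduler DAG mutation that keeps consecutive memory operations of the same
// kind (VMEM, FLAT, SMRD, DS) together by adding artificial edges so the
// post-RA scheduler does not move them apart.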
713 struct MemOpClusterMutation : ScheduleDAGMutation {
714 const SIInstrInfo *TII;
716 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
718 void apply(ScheduleDAGInstrs *DAG) override {
719 SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
724 for (SUnit &SU : DAG->SUnits) {
725 MachineInstr &MI2 = *SU.getInstr();
726 if (!MI2.mayLoad() && !MI2.mayStore()) {
735 MachineInstr &MI1 = *SUa->getInstr();
736 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
737 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
738 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
739 (TII->isDS(MI1) && TII->isDS(MI2))) {
740 SU.addPredBarrier(SUa);
742 for (const SDep &SI : SU.Preds) {
743 if (SI.getSUnit() != SUa)
744 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
747 if (&SU != &DAG->ExitSU) {
748 for (const SDep &SI : SUa->Succs) {
749 if (SI.getSUnit() != &SU)
750 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
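// Scheduler DAG mutation that fills the latency shadow of long-latency MFMA
// instructions with independent SALU instructions (see apply() below), so the
// shadow is not filled with VALU work that would cause power bursts.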
760 struct FillMFMAShadowMutation : ScheduleDAGMutation {
761 const SIInstrInfo *TII;
765 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
767 bool isSALU(const SUnit *SU) const {
768 const MachineInstr *MI = SU->getInstr();
769 return MI && TII->isSALU(*MI) && !MI->isTerminator();
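  // Conservatively check whether the edge Pred -> Succ can be added without
  // creating a cycle: collect the transitive successors of Succ, then walk
  // Pred and its transitive predecessors and fail if the two sets intersect.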
772 bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
773 if (Pred->NodeNum < Succ->NodeNum)
776 SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
778 for (unsigned I = 0; I < Succs.size(); ++I) {
779 for (const SDep &SI : Succs[I]->Succs) {
780 const SUnit *SU = SI.getSUnit();
781 if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
786 SmallPtrSet<const SUnit*, 32> Visited;
787 while (!Preds.empty()) {
788 const SUnit *SU = Preds.pop_back_val();
789 if (llvm::find(Succs, SU) != Succs.end())
792 for (const SDep &SI : SU->Preds)
793 if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
794 Preds.push_back(SI.getSUnit());
  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
802 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
803 SmallPtrSetImpl<SUnit *> &Visited) const {
804 SmallVector<SUnit *, 8> Worklist({To});
807 while (!Worklist.empty() && MaxChain-- > 0) {
808 SUnit *SU = Worklist.pop_back_val();
809 if (!Visited.insert(SU).second)
812 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
813 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
815 if (SU->addPred(SDep(From, SDep::Artificial), false))
818 for (SDep &SI : From->Succs) {
819 SUnit *SUv = SI.getSUnit();
820 if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
821 SUv->addPred(SDep(SU, SDep::Artificial), false);
824 for (SDep &SI : SU->Succs) {
825 SUnit *Succ = SI.getSUnit();
826 if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
827 Worklist.push_back(Succ);
834 void apply(ScheduleDAGInstrs *DAGInstrs) override {
835 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
836 if (!ST.hasMAIInsts() || DisablePowerSched)
838 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
839 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
840 if (!TSchedModel || DAG->SUnits.empty())
    // Scan for MFMA long latency instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
847 auto LastSALU = DAG->SUnits.begin();
848 auto E = DAG->SUnits.end();
849 SmallPtrSet<SUnit*, 32> Visited;
850 for (SUnit &SU : DAG->SUnits) {
851 MachineInstr &MAI = *SU.getInstr();
852 if (!TII->isMAI(MAI) ||
853 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
854 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
857 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
859 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
860 dbgs() << "Need " << Lat
861 << " instructions to cover latency.\n");
863 // Find up to Lat independent scalar instructions as early as
864 // possible such that they can be scheduled after this MFMA.
865 for ( ; Lat && LastSALU != E; ++LastSALU) {
866 if (Visited.count(&*LastSALU))
869 if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
872 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
879 void GCNSubtarget::getPostRAMutations(
880 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
881 Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
882 Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
885 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
886 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
887 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
889 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
892 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
893 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
894 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
896 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));