contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

   1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AMDGPUSubtarget.h"
  15 #include "AMDGPU.h"
  16 #include "AMDGPUTargetMachine.h"
  17 #include "AMDGPUCallLowering.h"
  18 #include "AMDGPUInstructionSelector.h"
  19 #include "AMDGPULegalizerInfo.h"
  20 #include "AMDGPURegisterBankInfo.h"
  21 #include "SIMachineFunctionInfo.h"
  22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  23 #include "llvm/ADT/SmallString.h"
  24 #include "llvm/CodeGen/MachineScheduler.h"
  25 #include "llvm/MC/MCSubtargetInfo.h"
  26 #include "llvm/IR/MDBuilder.h"
  27 #include "llvm/CodeGen/TargetFrameLowering.h"
  28 #include <algorithm>
  29
  30 using namespace llvm;
  31
  32 #define DEBUG_TYPE "amdgpu-subtarget"
  33
  34 #define GET_SUBTARGETINFO_TARGET_DESC
  35 #define GET_SUBTARGETINFO_CTOR
  36 #define AMDGPUSubtarget GCNSubtarget
  37 #include "AMDGPUGenSubtargetInfo.inc"
  38 #define GET_SUBTARGETINFO_TARGET_DESC
  39 #define GET_SUBTARGETINFO_CTOR
  40 #undef AMDGPUSubtarget
  41 #include "R600GenSubtargetInfo.inc"
  42
  43 static cl::opt<bool> DisablePowerSched(
  44   "amdgpu-disable-power-sched",
  45   cl::desc("Disable scheduling to minimize mAI power bursts"),
  46   cl::init(false));
  47
  48 static cl::opt<bool> EnableVGPRIndexMode(
  49   "amdgpu-vgpr-index-mode",
  50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  51   cl::init(false));
  52
  53 GCNSubtarget::~GCNSubtarget() = default;
  54
  55 R600Subtarget &
  56 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
  57                                                StringRef GPU, StringRef FS) {
  58   SmallString<256> FullFS("+promote-alloca,");
  59   FullFS += FS;
  60   ParseSubtargetFeatures(GPU, FullFS);
  61
  62   HasMulU24 = getGeneration() >= EVERGREEN;
  63   HasMulI24 = hasCaymanISA();
  64
  65   return *this;
  66 }
  67
  68 GCNSubtarget &
  69 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
  70                                               StringRef GPU, StringRef FS) {
  71   // Determine default and user-specified characteristics
  72   //
  73   // We want to be able to turn these off, but making this a subtarget feature
  74   // for SI has the unhelpful behavior that it unsets everything else if you
  75   // disable it.
  76   //
  77   // Similarly we want enable-prt-strict-null to be on by default and not to
  78   // unset everything else if it is disabled
  79
  80   // Assuming ECC is enabled is the conservative default.
  81   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
  82
  83   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
  84     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
  85
  86   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
  87
  88   // Disable mutually exclusive bits.
  89   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
  90     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
  91       FullFS += "-wavefrontsize16,";
  92     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
  93       FullFS += "-wavefrontsize32,";
  94     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
  95       FullFS += "-wavefrontsize64,";
  96   }
  97
  98   FullFS += FS;
  99
 100   ParseSubtargetFeatures(GPU, FullFS);
 101
 102   // We don't support FP64 for EG/NI atm.
 103   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
 104
 105   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
 106   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
 107   // variants of MUBUF instructions.
 108   if (!hasAddr64() && !FS.contains("flat-for-global")) {
 109     FlatForGlobal = true;
 110   }
 111
 112   // Set defaults if needed.
 113   if (MaxPrivateElementSize == 0)
 114     MaxPrivateElementSize = 4;
 115
 116   if (LDSBankCount == 0)
 117     LDSBankCount = 32;
 118
 119   if (TT.getArch() == Triple::amdgcn) {
 120     if (LocalMemorySize == 0)
 121       LocalMemorySize = 32768;
 122
 123     // Do something sensible for unspecified target.
 124     if (!HasMovrel && !HasVGPRIndexMode)
 125       HasMovrel = true;
 126   }
 127
 128   // Don't crash on invalid devices.
 129   if (WavefrontSizeLog2 == 0)
 130     WavefrontSizeLog2 = 5;
 131
 132   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
 133
 134   // Disable XNACK on targets where it is not enabled by default unless it is
 135   // explicitly requested.
 136   if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
 137     ToggleFeature(AMDGPU::FeatureXNACK);
 138     EnableXNACK = false;
 139   }
 140
 141   // ECC is on by default, but turn it off if the hardware doesn't support it
 142   // anyway. This matters for the gfx9 targets with d16 loads, but don't support
 143   // ECC.
 144   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
 145     ToggleFeature(AMDGPU::FeatureSRAMECC);
 146     EnableSRAMECC = false;
 147   }
 148
 149   return *this;
 150 }
 151
 152 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
 153   TargetTriple(TT),
 154   Has16BitInsts(false),
 155   HasMadMixInsts(false),
 156   HasMadMacF32Insts(false),
 157   HasDsSrc2Insts(false),
 158   HasSDWA(false),
 159   HasVOP3PInsts(false),
 160   HasMulI24(true),
 161   HasMulU24(true),
 162   HasInv2PiInlineImm(false),
 163   HasFminFmaxLegacy(true),
 164   EnablePromoteAlloca(false),
 165   HasTrigReducedRange(false),
 166   MaxWavesPerEU(10),
 167   LocalMemorySize(0),
 168   WavefrontSizeLog2(0)
 169   { }
 170
 171 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 172                            const GCNTargetMachine &TM) :
 173     AMDGPUGenSubtargetInfo(TT, GPU, FS),
 174     AMDGPUSubtarget(TT),
 175     TargetTriple(TT),
 176     Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
 177     InstrItins(getInstrItineraryForCPU(GPU)),
 178     LDSBankCount(0),
 179     MaxPrivateElementSize(0),
 180
 181     FastFMAF32(false),
 182     FastDenormalF32(false),
 183     HalfRate64Ops(false),
 184
 185     FlatForGlobal(false),
 186     AutoWaitcntBeforeBarrier(false),
 187     CodeObjectV3(false),
 188     UnalignedScratchAccess(false),
 189     UnalignedBufferAccess(false),
 190
 191     HasApertureRegs(false),
 192     EnableXNACK(false),
 193     DoesNotSupportXNACK(false),
 194     EnableCuMode(false),
 195     TrapHandler(false),
 196
 197     EnableLoadStoreOpt(false),
 198     EnableUnsafeDSOffsetFolding(false),
 199     EnableSIScheduler(false),
 200     EnableDS128(false),
 201     EnablePRTStrictNull(false),
 202     DumpCode(false),
 203
 204     FP64(false),
 205     GCN3Encoding(false),
 206     CIInsts(false),
 207     GFX8Insts(false),
 208     GFX9Insts(false),
 209     GFX10Insts(false),
 210     GFX10_3Insts(false),
 211     GFX7GFX8GFX9Insts(false),
 212     SGPRInitBug(false),
 213     HasSMemRealTime(false),
 214     HasIntClamp(false),
 215     HasFmaMixInsts(false),
 216     HasMovrel(false),
 217     HasVGPRIndexMode(false),
 218     HasScalarStores(false),
 219     HasScalarAtomics(false),
 220     HasSDWAOmod(false),
 221     HasSDWAScalar(false),
 222     HasSDWASdst(false),
 223     HasSDWAMac(false),
 224     HasSDWAOutModsVOPC(false),
 225     HasDPP(false),
 226     HasDPP8(false),
 227     HasR128A16(false),
 228     HasGFX10A16(false),
 229     HasG16(false),
 230     HasNSAEncoding(false),
 231     GFX10_BEncoding(false),
 232     HasDLInsts(false),
 233     HasDot1Insts(false),
 234     HasDot2Insts(false),
 235     HasDot3Insts(false),
 236     HasDot4Insts(false),
 237     HasDot5Insts(false),
 238     HasDot6Insts(false),
 239     HasMAIInsts(false),
 240     HasPkFmacF16Inst(false),
 241     HasAtomicFaddInsts(false),
 242     EnableSRAMECC(false),
 243     DoesNotSupportSRAMECC(false),
 244     HasNoSdstCMPX(false),
 245     HasVscnt(false),
 246     HasGetWaveIdInst(false),
 247     HasSMemTimeInst(false),
 248     HasRegisterBanking(false),
 249     HasVOP3Literal(false),
 250     HasNoDataDepHazard(false),
 251     FlatAddressSpace(false),
 252     FlatInstOffsets(false),
 253     FlatGlobalInsts(false),
 254     FlatScratchInsts(false),
 255     ScalarFlatScratchInsts(false),
 256     AddNoCarryInsts(false),
 257     HasUnpackedD16VMem(false),
 258     LDSMisalignedBug(false),
 259     HasMFMAInlineLiteralBug(false),
 260
 261     ScalarizeGlobal(false),
 262
 263     HasVcmpxPermlaneHazard(false),
 264     HasVMEMtoScalarWriteHazard(false),
 265     HasSMEMtoVectorWriteHazard(false),
 266     HasInstFwdPrefetchBug(false),
 267     HasVcmpxExecWARHazard(false),
 268     HasLdsBranchVmemWARHazard(false),
 269     HasNSAtoVMEMBug(false),
 270     HasOffset3fBug(false),
 271     HasFlatSegmentOffsetBug(false),
 272
 273     FeatureDisable(false),
 274     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
 275     TLInfo(TM, *this),
 276     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
 277   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
 278   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
 279   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
 280   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
 281   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
 282   InstSelector.reset(new AMDGPUInstructionSelector(
 283   *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
 284 }
 285
 286 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
 287   if (getGeneration() < GFX10)
 288     return 1;
 289
 290   switch (Opcode) {
 291   case AMDGPU::V_LSHLREV_B64:
 292   case AMDGPU::V_LSHLREV_B64_gfx10:
 293   case AMDGPU::V_LSHL_B64:
 294   case AMDGPU::V_LSHRREV_B64:
 295   case AMDGPU::V_LSHRREV_B64_gfx10:
 296   case AMDGPU::V_LSHR_B64:
 297   case AMDGPU::V_ASHRREV_I64:
 298   case AMDGPU::V_ASHRREV_I64_gfx10:
 299   case AMDGPU::V_ASHR_I64:
 300     return 1;
 301   }
 302
 303   return 2;
 304 }
 305
 306 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
 307   const Function &F) const {
 308   if (NWaves == 1)
 309     return getLocalMemorySize();
 310   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
 311   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
 312   if (!WorkGroupsPerCu)
 313     return 0;
 314   unsigned MaxWaves = getMaxWavesPerEU();
 315   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
 316 }
 317
 318 // FIXME: Should return min,max range.
 319 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
 320   const Function &F) const {
 321   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
 322   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
 323   if (!MaxWorkGroupsPerCu)
 324     return 0;
 325
 326   const unsigned WaveSize = getWavefrontSize();
 327
 328   // FIXME: Do we need to account for alignment requirement of LDS rounding the
 329   // size up?
 330   // Compute restriction based on LDS usage
 331   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
 332
 333   // This can be queried with more LDS than is possible, so just assume the
 334   // worst.
 335   if (NumGroups == 0)
 336     return 1;
 337
 338   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
 339
 340   // Round to the number of waves.
 341   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
 342   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
 343
 344   // Clamp to the maximum possible number of waves.
 345   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
 346
 347   // FIXME: Needs to be a multiple of the group size?
 348   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
 349
 350   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
 351          "computed invalid occupancy");
 352   return MaxWaves;
 353 }
 354
 355 unsigned
 356 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
 357   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
 358   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
 359 }
 360
 361 std::pair<unsigned, unsigned>
 362 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
 363   switch (CC) {
 364   case CallingConv::AMDGPU_VS:
 365   case CallingConv::AMDGPU_LS:
 366   case CallingConv::AMDGPU_HS:
 367   case CallingConv::AMDGPU_ES:
 368   case CallingConv::AMDGPU_GS:
 369   case CallingConv::AMDGPU_PS:
 370     return std::make_pair(1, getWavefrontSize());
 371   default:
 372     return std::make_pair(1u, getMaxFlatWorkGroupSize());
 373   }
 374 }
 375
 376 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
 377   const Function &F) const {
 378   // Default minimum/maximum flat work group sizes.
 379   std::pair<unsigned, unsigned> Default =
 380     getDefaultFlatWorkGroupSize(F.getCallingConv());
 381
 382   // Requested minimum/maximum flat work group sizes.
 383   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
 384     F, "amdgpu-flat-work-group-size", Default);
 385
 386   // Make sure requested minimum is less than requested maximum.
 387   if (Requested.first > Requested.second)
 388     return Default;
 389
 390   // Make sure requested values do not violate subtarget's specifications.
 391   if (Requested.first < getMinFlatWorkGroupSize())
 392     return Default;
 393   if (Requested.second > getMaxFlatWorkGroupSize())
 394     return Default;
 395
 396   return Requested;
 397 }
 398
 399 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
 400   const Function &F) const {
 401   // Default minimum/maximum number of waves per execution unit.
 402   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
 403
 404   // Default/requested minimum/maximum flat work group sizes.
 405   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
 406
 407   // If minimum/maximum flat work group sizes were explicitly requested using
 408   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
 409   // number of waves per execution unit to values implied by requested
 410   // minimum/maximum flat work group sizes.
 411   unsigned MinImpliedByFlatWorkGroupSize =
 412     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
 413   Default.first = MinImpliedByFlatWorkGroupSize;
 414   bool RequestedFlatWorkGroupSize =
 415       F.hasFnAttribute("amdgpu-flat-work-group-size");
 416
 417   // Requested minimum/maximum number of waves per execution unit.
 418   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
 419     F, "amdgpu-waves-per-eu", Default, true);
 420
 421   // Make sure requested minimum is less than requested maximum.
 422   if (Requested.second && Requested.first > Requested.second)
 423     return Default;
 424
 425   // Make sure requested values do not violate subtarget's specifications.
 426   if (Requested.first < getMinWavesPerEU() ||
 427       Requested.second > getMaxWavesPerEU())
 428     return Default;
 429
 430   // Make sure requested values are compatible with values implied by requested
 431   // minimum/maximum flat work group sizes.
 432   if (RequestedFlatWorkGroupSize &&
 433       Requested.first < MinImpliedByFlatWorkGroupSize)
 434     return Default;
 435
 436   return Requested;
 437 }
 438
 439 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
 440   Function *Kernel = I->getParent()->getParent();
 441   unsigned MinSize = 0;
 442   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
 443   bool IdQuery = false;
 444
 445   // If reqd_work_group_size is present it narrows value down.
 446   if (auto *CI = dyn_cast<CallInst>(I)) {
 447     const Function *F = CI->getCalledFunction();
 448     if (F) {
 449       unsigned Dim = UINT_MAX;
 450       switch (F->getIntrinsicID()) {
 451       case Intrinsic::amdgcn_workitem_id_x:
 452       case Intrinsic::r600_read_tidig_x:
 453         IdQuery = true;
 454         LLVM_FALLTHROUGH;
 455       case Intrinsic::r600_read_local_size_x:
 456         Dim = 0;
 457         break;
 458       case Intrinsic::amdgcn_workitem_id_y:
 459       case Intrinsic::r600_read_tidig_y:
 460         IdQuery = true;
 461         LLVM_FALLTHROUGH;
 462       case Intrinsic::r600_read_local_size_y:
 463         Dim = 1;
 464         break;
 465       case Intrinsic::amdgcn_workitem_id_z:
 466       case Intrinsic::r600_read_tidig_z:
 467         IdQuery = true;
 468         LLVM_FALLTHROUGH;
 469       case Intrinsic::r600_read_local_size_z:
 470         Dim = 2;
 471         break;
 472       default:
 473         break;
 474       }
 475       if (Dim <= 3) {
 476         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
 477           if (Node->getNumOperands() == 3)
 478             MinSize = MaxSize = mdconst::extract<ConstantInt>(
 479                                   Node->getOperand(Dim))->getZExtValue();
 480       }
 481     }
 482   }
 483
 484   if (!MaxSize)
 485     return false;
 486
 487   // Range metadata is [Lo, Hi). For ID query we need to pass max size
 488   // as Hi. For size query we need to pass Hi + 1.
 489   if (IdQuery)
 490     MinSize = 0;
 491   else
 492     ++MaxSize;
 493
 494   MDBuilder MDB(I->getContext());
 495   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
 496                                                   APInt(32, MaxSize));
 497   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 498   return true;
 499 }
 500
 501 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
 502                                                  Align &MaxAlign) const {
 503   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
 504          F.getCallingConv() == CallingConv::SPIR_KERNEL);
 505
 506   const DataLayout &DL = F.getParent()->getDataLayout();
 507   uint64_t ExplicitArgBytes = 0;
 508   MaxAlign = Align(1);
 509
 510   for (const Argument &Arg : F.args()) {
 511     Type *ArgTy = Arg.getType();
 512
 513     const Align Alignment = DL.getABITypeAlign(ArgTy);
 514     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
 515     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
 516     MaxAlign = std::max(MaxAlign, Alignment);
 517   }
 518
 519   return ExplicitArgBytes;
 520 }
 521
 522 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
 523                                                 Align &MaxAlign) const {
 524   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
 525
 526   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
 527
 528   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
 529   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
 530   if (ImplicitBytes != 0) {
 531     const Align Alignment = getAlignmentForImplicitArgPtr();
 532     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
 533   }
 534
 535   // Being able to dereference past the end is useful for emitting scalar loads.
 536   return alignTo(TotalSize, 4);
 537 }
 538
 539 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
 540                              const TargetMachine &TM) :
 541   R600GenSubtargetInfo(TT, GPU, FS),
 542   AMDGPUSubtarget(TT),
 543   InstrInfo(*this),
 544   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
 545   FMA(false),
 546   CaymanISA(false),
 547   CFALUBug(false),
 548   HasVertexCache(false),
 549   R600ALUInst(false),
 550   FP64(false),
 551   TexVTXClauseSize(0),
 552   Gen(R600),
 553   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
 554   InstrItins(getInstrItineraryForCPU(GPU)) { }
 555
 556 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
 557                                       unsigned NumRegionInstrs) const {
 558   // Track register pressure so the scheduler can try to decrease
 559   // pressure once register usage is above the threshold defined by
 560   // SIRegisterInfo::getRegPressureSetLimit()
 561   Policy.ShouldTrackPressure = true;
 562
 563   // Enabling both top down and bottom up scheduling seems to give us less
 564   // register spills than just using one of these approaches on its own.
 565   Policy.OnlyTopDown = false;
 566   Policy.OnlyBottomUp = false;
 567
 568   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
 569   if (!enableSIScheduler())
 570     Policy.ShouldTrackLaneMasks = true;
 571 }
 572
 573 bool GCNSubtarget::hasMadF16() const {
 574   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
 575 }
 576
 577 bool GCNSubtarget::useVGPRIndexMode() const {
 578   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
 579 }
 580
 581 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
 582   if (getGeneration() >= AMDGPUSubtarget::GFX10)
 583     return getMaxWavesPerEU();
 584
 585   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
 586     if (SGPRs <= 80)
 587       return 10;
 588     if (SGPRs <= 88)
 589       return 9;
 590     if (SGPRs <= 100)
 591       return 8;
 592     return 7;
 593   }
 594   if (SGPRs <= 48)
 595     return 10;
 596   if (SGPRs <= 56)
 597     return 9;
 598   if (SGPRs <= 64)
 599     return 8;
 600   if (SGPRs <= 72)
 601     return 7;
 602   if (SGPRs <= 80)
 603     return 6;
 604   return 5;
 605 }
 606
 607 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
 608   unsigned MaxWaves = getMaxWavesPerEU();
 609   unsigned Granule = getVGPRAllocGranule();
 610   if (VGPRs < Granule)
 611     return MaxWaves;
 612   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
 613   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
 614 }
 615
 616 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
 617   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 618   if (getGeneration() >= AMDGPUSubtarget::GFX10)
 619     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
 620
 621   if (MFI.hasFlatScratchInit()) {
 622     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
 623       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
 624     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
 625       return 4; // FLAT_SCRATCH, VCC (in that order).
 626   }
 627
 628   if (isXNACKEnabled())
 629     return 4; // XNACK, VCC (in that order).
 630   return 2; // VCC.
 631 }
 632
 633 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
 634                                         unsigned NumSGPRs,
 635                                         unsigned NumVGPRs) const {
 636   unsigned Occupancy =
 637     std::min(getMaxWavesPerEU(),
 638              getOccupancyWithLocalMemSize(LDSSize, F));
 639   if (NumSGPRs)
 640     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
 641   if (NumVGPRs)
 642     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
 643   return Occupancy;
 644 }
 645
 646 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
 647   const Function &F = MF.getFunction();
 648   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 649
 650   // Compute maximum number of SGPRs function can use using default/requested
 651   // minimum number of waves per execution unit.
 652   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
 653   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
 654   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
 655
 656   // Check if maximum number of SGPRs was explicitly requested using
 657   // "amdgpu-num-sgpr" attribute.
 658   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
 659     unsigned Requested = AMDGPU::getIntegerAttribute(
 660       F, "amdgpu-num-sgpr", MaxNumSGPRs);
 661
 662     // Make sure requested value does not violate subtarget's specifications.
 663     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
 664       Requested = 0;
 665
 666     // If more SGPRs are required to support the input user/system SGPRs,
 667     // increase to accommodate them.
 668     //
 669     // FIXME: This really ends up using the requested number of SGPRs + number
 670     // of reserved special registers in total. Theoretically you could re-use
 671     // the last input registers for these special registers, but this would
 672     // require a lot of complexity to deal with the weird aliasing.
 673     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
 674     if (Requested && Requested < InputNumSGPRs)
 675       Requested = InputNumSGPRs;
 676
 677     // Make sure requested value is compatible with values implied by
 678     // default/requested minimum/maximum number of waves per execution unit.
 679     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
 680       Requested = 0;
 681     if (WavesPerEU.second &&
 682         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
 683       Requested = 0;
 684
 685     if (Requested)
 686       MaxNumSGPRs = Requested;
 687   }
 688
 689   if (hasSGPRInitBug())
 690     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
 691
 692   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
 693                   MaxAddressableNumSGPRs);
 694 }
 695
 696 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
 697   const Function &F = MF.getFunction();
 698   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 699
 700   // Compute maximum number of VGPRs function can use using default/requested
 701   // minimum number of waves per execution unit.
 702   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
 703   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
 704
 705   // Check if maximum number of VGPRs was explicitly requested using
 706   // "amdgpu-num-vgpr" attribute.
 707   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
 708     unsigned Requested = AMDGPU::getIntegerAttribute(
 709       F, "amdgpu-num-vgpr", MaxNumVGPRs);
 710
 711     // Make sure requested value is compatible with values implied by
 712     // default/requested minimum/maximum number of waves per execution unit.
 713     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
 714       Requested = 0;
 715     if (WavesPerEU.second &&
 716         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
 717       Requested = 0;
 718
 719     if (Requested)
 720       MaxNumVGPRs = Requested;
 721   }
 722
 723   return MaxNumVGPRs;
 724 }
 725
 726 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
 727                                          int UseOpIdx, SDep &Dep) const {
 728   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
 729       !Def->isInstr() || !Use->isInstr())
 730     return;
 731
 732   MachineInstr *DefI = Def->getInstr();
 733   MachineInstr *UseI = Use->getInstr();
 734
 735   if (DefI->isBundle()) {
 736     const SIRegisterInfo *TRI = getRegisterInfo();
 737     auto Reg = Dep.getReg();
 738     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
 739     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
 740     unsigned Lat = 0;
 741     for (++I; I != E && I->isBundledWithPred(); ++I) {
 742       if (I->modifiesRegister(Reg, TRI))
 743         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
 744       else if (Lat)
 745         --Lat;
 746     }
 747     Dep.setLatency(Lat);
 748   } else if (UseI->isBundle()) {
 749     const SIRegisterInfo *TRI = getRegisterInfo();
 750     auto Reg = Dep.getReg();
 751     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
 752     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
 753     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
 754     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
 755       if (I->readsRegister(Reg, TRI))
 756         break;
 757       --Lat;
 758     }
 759     Dep.setLatency(Lat);
 760   }
 761 }
 762
 763 namespace {
 764 struct FillMFMAShadowMutation : ScheduleDAGMutation {
 765   const SIInstrInfo *TII;
 766
 767   ScheduleDAGMI *DAG;
 768
 769   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
 770
 771   bool isSALU(const SUnit *SU) const {
 772     const MachineInstr *MI = SU->getInstr();
 773     return MI && TII->isSALU(*MI) && !MI->isTerminator();
 774   }
 775
 776   bool isVALU(const SUnit *SU) const {
 777     const MachineInstr *MI = SU->getInstr();
 778     return MI && TII->isVALU(*MI);
 779   }
 780
 781   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
 782     if (Pred->NodeNum < Succ->NodeNum)
 783       return true;
 784
 785     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
 786
 787     for (unsigned I = 0; I < Succs.size(); ++I) {
 788       for (const SDep &SI : Succs[I]->Succs) {
 789         const SUnit *SU = SI.getSUnit();
 790         if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
 791           Succs.push_back(SU);
 792       }
 793     }
 794
 795     SmallPtrSet<const SUnit*, 32> Visited;
 796     while (!Preds.empty()) {
 797       const SUnit *SU = Preds.pop_back_val();
 798       if (llvm::find(Succs, SU) != Succs.end())
 799         return false;
 800       Visited.insert(SU);
 801       for (const SDep &SI : SU->Preds)
 802         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
 803           Preds.push_back(SI.getSUnit());
 804     }
 805
 806     return true;
 807   }
 808
 809   // Link as much SALU intructions in chain as possible. Return the size
 810   // of the chain. Links up to MaxChain instructions.
 811   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
 812                          SmallPtrSetImpl<SUnit *> &Visited) const {
 813     SmallVector<SUnit *, 8> Worklist({To});
 814     unsigned Linked = 0;
 815
 816     while (!Worklist.empty() && MaxChain-- > 0) {
 817       SUnit *SU = Worklist.pop_back_val();
 818       if (!Visited.insert(SU).second)
 819         continue;
 820
 821       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
 822                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
 823
 824       if (SU->addPred(SDep(From, SDep::Artificial), false))
 825         ++Linked;
 826
 827       for (SDep &SI : From->Succs) {
 828         SUnit *SUv = SI.getSUnit();
 829         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
 830           SUv->addPred(SDep(SU, SDep::Artificial), false);
 831       }
 832
 833       for (SDep &SI : SU->Succs) {
 834         SUnit *Succ = SI.getSUnit();
 835         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
 836           Worklist.push_back(Succ);
 837       }
 838     }
 839
 840     return Linked;
 841   }
 842
 843   void apply(ScheduleDAGInstrs *DAGInstrs) override {
 844     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
 845     if (!ST.hasMAIInsts() || DisablePowerSched)
 846       return;
 847     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
 848     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
 849     if (!TSchedModel || DAG->SUnits.empty())
 850       return;
 851
 852     // Scan for MFMA long latency instructions and try to add a dependency
 853     // of available SALU instructions to give them a chance to fill MFMA
 854     // shadow. That is desirable to fill MFMA shadow with SALU instructions
 855     // rather than VALU to prevent power consumption bursts and throttle.
 856     auto LastSALU = DAG->SUnits.begin();
 857     auto E = DAG->SUnits.end();
 858     SmallPtrSet<SUnit*, 32> Visited;
 859     for (SUnit &SU : DAG->SUnits) {
 860       MachineInstr &MAI = *SU.getInstr();
 861       if (!TII->isMAI(MAI) ||
 862            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
 863            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
 864         continue;
 865
 866       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
 867
 868       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
 869                  dbgs() << "Need " << Lat
 870                         << " instructions to cover latency.\n");
 871
 872       // Find up to Lat independent scalar instructions as early as
 873       // possible such that they can be scheduled after this MFMA.
 874       for ( ; Lat && LastSALU != E; ++LastSALU) {
 875         if (Visited.count(&*LastSALU))
 876           continue;
 877
 878         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
 879           continue;
 880
 881         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
 882       }
 883     }
 884   }
 885 };
 886 } // namespace
 887
 888 void GCNSubtarget::getPostRAMutations(
 889     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
 890   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
 891 }
 892
 893 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
 894   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
 895     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
 896   else
 897     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
 898 }
 899
 900 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
 901   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
 902     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
 903   else
 904     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
 905 }