//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
15 #include "AMDGPUSubtarget.h"
17 #include "AMDGPUTargetMachine.h"
18 #ifdef LLVM_BUILD_GLOBAL_ISEL
19 #include "AMDGPUCallLowering.h"
20 #include "AMDGPUInstructionSelector.h"
21 #include "AMDGPULegalizerInfo.h"
22 #include "AMDGPURegisterBankInfo.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/CodeGen/MachineScheduler.h"
27 #include "llvm/IR/MDBuilder.h"
28 #include "llvm/Target/TargetFrameLowering.h"
33 #define DEBUG_TYPE "amdgpu-subtarget"
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #define GET_SUBTARGETINFO_CTOR
37 #include "AMDGPUGenSubtargetInfo.inc"
39 AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {

struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<RegisterBankInfo> RegBankInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
  const InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }
  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }
  const RegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }
};

} // end anonymous namespace
#endif

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    MaxPrivateElementSize(0),
    HalfRate64Ops(false),
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),
    HasApertureRegs(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWAScalar(false),
    HasSDWAOutModsVOPC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    HasVertexCache(false),
    ScalarizeGlobal(false),
    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
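
// Worked example for the bound computed above (numbers are illustrative, not
// taken from any particular target): with 64 KiB of LDS per CU, MaxWaves = 10,
// 4 work groups resident per CU, and NWaves = 8, each work group may use up to
// 65536 * 10 / 4 / 8 = 20480 bytes of LDS.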

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
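
// Illustrative occupancy example for the computation above: with 64 KiB of
// LDS, MaxWaves = 10, and 4 work groups per CU, Limit = 163840; a kernel that
// uses 32 KiB of LDS then gets min(163840 / 32768, 10) = 5 waves, clamped to
// the range [1, MaxWaves].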

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
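
// The requested sizes above come from an IR function attribute of the form
// "min,max", e.g. (values illustrative):
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// A request with min > max, or one outside the subtarget's supported range,
// falls back to the defaults computed above.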

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
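
// The waves-per-EU request likewise comes from an IR function attribute, e.g.
// (values illustrative):
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
// The maximum may be omitted (the call above only requires the first value);
// requests that conflict with the limits implied by the flat work group size
// fall back to the defaults.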

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
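
// Example of the metadata attached above (values illustrative): for an ID
// query such as a call to llvm.amdgcn.workitem.id.x with MaxSize = 256, the
// call gets !range !0 with !0 = !{i32 0, i32 256}, i.e. the half-open
// interval [0, 256); a size query instead gets [MinSize, MaxSize + 1).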

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
  : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TLInfo(TM, *this) {
#ifndef LLVM_BUILD_GLOBAL_ISEL
  GISelAccessor *GISel = new GISelAccessor();
#else
  SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
  GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  GISel->Legalizer.reset(new AMDGPULegalizerInfo());

  GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  GISel->InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
#endif
  setGISelAccessor(*GISel);
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using either approach on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}
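
// Illustrative example of the computation above: with 36 bytes of explicit
// kernel arguments, an 8-byte implicit-argument alignment, and 56 implicit
// bytes, the segment size is alignTo(36, 8) + 56 = 40 + 56 = 96 bytes.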

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}