1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
13 //===----------------------------------------------------------------------===//
15 #include "AMDGPUSubtarget.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/CodeGen/MachineScheduler.h"
19 #include "llvm/IR/MDBuilder.h"
20 #include "llvm/Target/TargetFrameLowering.h"
25 #define DEBUG_TYPE "amdgpu-subtarget"
27 #define GET_SUBTARGETINFO_TARGET_DESC
28 #define GET_SUBTARGETINFO_CTOR
29 #include "AMDGPUGenSubtargetInfo.inc"
// Defaulted destructor, defined out-of-line in the .cpp so it is emitted in a
// single translation unit.
31 AMDGPUSubtarget::~AMDGPUSubtarget() = default;
// Parse the GPU/feature strings and fix up feature bits that depend on the OS
// and generation.  Called from the constructor before any other queries.
// NOTE(review): the listing elides lines here (the return-type line of this
// definition, the body of the flat-for-global `if`, and the function's
// closing brace are not shown) — consult the full file before editing.
34 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
35 StringRef GPU, StringRef FS) {
36 // Determine default and user-specified characteristics
37 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
38 // enabled, but some instructions do not respect them and they run at the
39 // double precision rate, so don't enable by default.
41 // We want to be able to turn these off, but making this a subtarget feature
42 // for SI has the unhelpful behavior that it unsets everything else if you
// Default-on features are prepended so that explicit bits in FS (parsed
// later) can still override them.
45 SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
46 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
47 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
51 ParseSubtargetFeatures(GPU, FullFS);
53 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
54 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
55 // variants of MUBUF instructions.
56 if (!hasAddr64() && !FS.contains("flat-for-global")) {
60 // FIXME: I don't think Evergreen has any useful support for
61 // denormals, but should be checked. Should we issue a warning somewhere
62 // if someone tries to enable these?
// Pre-SI (R600-family) generations: force both denormal modes off.
63 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
64 FP64FP16Denormals = false;
65 FP32Denormals = false;
68 // Set defaults if needed.
69 if (MaxPrivateElementSize == 0)
70 MaxPrivateElementSize = 4;
// Constructor: every feature flag starts false / zero and is then set by
// ParseSubtargetFeatures via initializeSubtargetDependencies, so the values
// below are only the pre-parse defaults.
// NOTE(review): several initializer-list entries and the closing brace fall
// on lines elided from this listing.
75 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
76 const TargetMachine &TM)
77 : AMDGPUGenSubtargetInfo(TT, GPU, FS),
// amdgcn triples start at SOUTHERN_ISLANDS; everything else is R600-family.
79 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
80 IsaVersion(ISAVersion0_0_0),
84 MaxPrivateElementSize(0),
90 FP64FP16Denormals(false),
94 AutoWaitcntBeforeBarrier(false),
95 UnalignedScratchAccess(false),
96 UnalignedBufferAccess(false),
98 HasApertureRegs(false),
101 DebuggerInsertNops(false),
102 DebuggerReserveRegs(false),
103 DebuggerEmitPrologue(false),
105 EnableVGPRSpilling(false),
106 EnablePromoteAlloca(false),
107 EnableLoadStoreOpt(false),
108 EnableUnsafeDSOffsetFolding(false),
109 EnableSIScheduler(false),
119 HasSMemRealTime(false),
120 Has16BitInsts(false),
121 HasVOP3PInsts(false),
123 HasVGPRIndexMode(false),
124 HasScalarStores(false),
125 HasInv2PiInlineImm(false),
128 FlatAddressSpace(false),
129 FlatInstOffsets(false),
130 FlatGlobalInsts(false),
131 FlatScratchInsts(false),
136 HasVertexCache(false),
138 ScalarizeGlobal(false),
140 FeatureDisable(false),
141 InstrItins(getInstrItineraryForCPU(GPU)) {
// Cache the address-space mapping for this triple, then resolve the real
// feature bits from GPU/FS.
142 AS = AMDGPU::getAMDGPUAS(TT);
143 initializeSubtargetDependencies(TT, GPU, FS);
// Returns the maximum number of LDS bytes a workgroup can use while still
// sustaining NWaves waves per execution unit, by scaling the per-CU LDS size
// by the wave/workgroup occupancy ratio.
// NOTE(review): as listed, the bare `return getLocalMemorySize();` makes the
// rest of the function unreachable — its guarding `if` (presumably the
// NWaves-at-minimum case) is on an elided line; confirm against the full file.
146 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
147 const Function &F) const {
149 return getLocalMemorySize();
150 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
151 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
152 unsigned MaxWaves = getMaxWavesPerEU();
153 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
// Inverse of getMaxLocalMemSizeWithWaveCount: given an LDS byte budget,
// compute how many waves per EU can run, clamped to [1, getMaxWavesPerEU()].
// NOTE(review): the final `return NumWaves;` and closing brace fall on lines
// elided from this listing.
156 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
157 const Function &F) const {
158 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
159 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
160 unsigned MaxWaves = getMaxWavesPerEU();
161 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
// Guard against division by zero when the function uses no LDS.
162 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
163 NumWaves = std::min(NumWaves, MaxWaves);
164 NumWaves = std::max(NumWaves, 1u);
// Returns the {minimum, maximum} flat workgroup size for F: defaults derived
// from the wavefront size, overridden by the function's
// "amdgpu-max-work-group-size" / "amdgpu-flat-work-group-size" attributes and
// then validated against the subtarget's limits.
// NOTE(review): the bodies of the three validation `if`s (which return the
// defaults on invalid requests) and the final return are on elided lines.
168 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
169 const Function &F) const {
170 // Default minimum/maximum flat work group sizes.
171 std::pair<unsigned, unsigned> Default =
172 AMDGPU::isCompute(F.getCallingConv()) ?
173 std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
174 getWavefrontSize() * 4) :
175 std::pair<unsigned, unsigned>(1, getWavefrontSize());
177 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
178 // starts using "amdgpu-flat-work-group-size" attribute.
179 Default.second = AMDGPU::getIntegerAttribute(
180 F, "amdgpu-max-work-group-size", Default.second);
181 Default.first = std::min(Default.first, Default.second);
183 // Requested minimum/maximum flat work group sizes.
184 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
185 F, "amdgpu-flat-work-group-size", Default);
187 // Make sure requested minimum is less than requested maximum.
188 if (Requested.first > Requested.second)
191 // Make sure requested values do not violate subtarget's specifications.
192 if (Requested.first < getMinFlatWorkGroupSize())
194 if (Requested.second > getMaxFlatWorkGroupSize())
// Returns the {minimum, maximum} number of waves per execution unit for F,
// taking "amdgpu-waves-per-eu" into account and reconciling it with the
// bounds implied by the requested flat workgroup sizes.
// NOTE(review): the bodies of the validation `if`s (returning Default on
// invalid requests) and the final return are on elided lines.
200 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
201 const Function &F) const {
202 // Default minimum/maximum number of waves per execution unit.
203 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
205 // Default/requested minimum/maximum flat work group sizes.
206 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
208 // If minimum/maximum flat work group sizes were explicitly requested using
209 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
210 // number of waves per execution unit to values implied by requested
211 // minimum/maximum flat work group sizes.
212 unsigned MinImpliedByFlatWorkGroupSize =
213 getMaxWavesPerEU(FlatWorkGroupSizes.second);
214 bool RequestedFlatWorkGroupSize = false;
216 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
217 // starts using "amdgpu-flat-work-group-size" attribute.
218 if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
219 F.hasFnAttribute("amdgpu-flat-work-group-size")) {
220 Default.first = MinImpliedByFlatWorkGroupSize;
221 RequestedFlatWorkGroupSize = true;
224 // Requested minimum/maximum number of waves per execution unit.
225 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
226 F, "amdgpu-waves-per-eu", Default, true);
228 // Make sure requested minimum is less than requested maximum.
// Requested.second == 0 means "no explicit maximum", so skip the ordering
// check in that case.
229 if (Requested.second && Requested.first > Requested.second)
232 // Make sure requested values do not violate subtarget's specifications.
233 if (Requested.first < getMinWavesPerEU() ||
234 Requested.first > getMaxWavesPerEU())
236 if (Requested.second > getMaxWavesPerEU())
239 // Make sure requested values are compatible with values implied by requested
240 // minimum/maximum flat work group sizes.
241 if (RequestedFlatWorkGroupSize &&
242 Requested.first > MinImpliedByFlatWorkGroupSize)
// Attaches !range metadata to a local-ID / local-size intrinsic call I,
// bounding it by the kernel's flat workgroup size (narrowed further by
// reqd_work_group_size when present).  Returns whether metadata was added.
// NOTE(review): this listing elides the per-case bodies of the switch (the
// Dim assignments / IdQuery flags), the early-return guards, and the upper
// bound passed to createRange — consult the full file before editing.
248 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
249 Function *Kernel = I->getParent()->getParent();
250 unsigned MinSize = 0;
251 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
252 bool IdQuery = false;
254 // If reqd_work_group_size is present it narrows value down.
255 if (auto *CI = dyn_cast<CallInst>(I)) {
256 const Function *F = CI->getCalledFunction();
258 unsigned Dim = UINT_MAX;
259 switch (F->getIntrinsicID()) {
260 case Intrinsic::amdgcn_workitem_id_x:
261 case Intrinsic::r600_read_tidig_x:
263 case Intrinsic::r600_read_local_size_x:
266 case Intrinsic::amdgcn_workitem_id_y:
267 case Intrinsic::r600_read_tidig_y:
269 case Intrinsic::r600_read_local_size_y:
272 case Intrinsic::amdgcn_workitem_id_z:
273 case Intrinsic::r600_read_tidig_z:
275 case Intrinsic::r600_read_local_size_z:
// reqd_work_group_size pins the size exactly in the queried dimension.
282 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
283 if (Node->getNumOperands() == 3)
284 MinSize = MaxSize = mdconst::extract<ConstantInt>(
285 Node->getOperand(Dim))->getZExtValue();
293 // Range metadata is [Lo, Hi). For ID query we need to pass max size
294 // as Hi. For size query we need to pass Hi + 1.
300 MDBuilder MDB(I->getContext());
301 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
303 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
// R600 subtarget: delegates to the common AMDGPUSubtarget constructor; the
// frame grows upward (scratch is addressed positively from the frame base).
// NOTE(review): the remaining initializer-list entries and closing brace are
// on lines elided from this listing.
307 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
308 const TargetMachine &TM) :
309 AMDGPUSubtarget(TT, GPU, FS, TM),
311 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
// SI (GCN) subtarget: delegates to the common AMDGPUSubtarget constructor;
// like R600, the frame grows upward.
// NOTE(review): the remaining initializer-list entries and closing brace are
// on lines elided from this listing.
314 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
315 const TargetMachine &TM) :
316 AMDGPUSubtarget(TT, GPU, FS, TM),
318 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
321 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
322 unsigned NumRegionInstrs) const {
323 // Track register pressure so the scheduler can try to decrease
324 // pressure once register usage is above the threshold defined by
325 // SIRegisterInfo::getRegPressureSetLimit()
326 Policy.ShouldTrackPressure = true;
328 // Enabling both top down and bottom up scheduling seems to give us less
329 // register spills than just using one of these approaches on its own.
330 Policy.OnlyTopDown = false;
331 Policy.OnlyBottomUp = false;
333 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
334 if (!enableSIScheduler())
335 Policy.ShouldTrackLaneMasks = true;
338 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
339 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
342 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
343 unsigned ExplicitArgBytes) const {
344 unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
345 if (ImplicitBytes == 0)
346 return ExplicitArgBytes;
348 unsigned Alignment = getAlignmentForImplicitArgPtr();
349 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
// Returns the waves-per-EU occupancy achievable with the given SGPR count.
// NOTE(review): the entire body (the VI+ branch shown below, the pre-VI
// thresholds, and the returns) falls on lines elided from this listing.
352 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
353 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Returns the waves-per-EU occupancy achievable with the given VGPR count.
// NOTE(review): the body is entirely on lines elided from this listing.
375 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
// Returns how many SGPRs at the top of the allocation are reserved for
// FLAT_SCRATCH, XNACK and VCC, depending on generation and features.
// NOTE(review): the closing braces and the fallback return (the plain-VCC
// case) are on lines elided from this listing.
397 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
398 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
399 if (MFI.hasFlatScratchInit()) {
400 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
401 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
402 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
403 return 4; // FLAT_SCRATCH, VCC (in that order).
406 if (isXNACKEnabled())
407 return 4; // XNACK, VCC (in that order).
// Computes the maximum number of SGPRs available to MF, combining the
// waves-per-EU-implied limit, the optional "amdgpu-num-sgpr" attribute, the
// preloaded input SGPRs, the SGPR-init hardware bug, and the reserved SGPRs.
// NOTE(review): the bodies of several clamp `if`s (which adjust Requested)
// are on lines elided from this listing.
411 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
412 const Function &F = *MF.getFunction();
413 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
415 // Compute maximum number of SGPRs function can use using default/requested
416 // minimum number of waves per execution unit.
417 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
418 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
419 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
421 // Check if maximum number of SGPRs was explicitly requested using
422 // "amdgpu-num-sgpr" attribute.
423 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
424 unsigned Requested = AMDGPU::getIntegerAttribute(
425 F, "amdgpu-num-sgpr", MaxNumSGPRs);
427 // Make sure requested value does not violate subtarget's specifications.
428 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
431 // If more SGPRs are required to support the input user/system SGPRs,
432 // increase to accommodate them.
434 // FIXME: This really ends up using the requested number of SGPRs + number
435 // of reserved special registers in total. Theoretically you could re-use
436 // the last input registers for these special registers, but this would
437 // require a lot of complexity to deal with the weird aliasing.
438 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
439 if (Requested && Requested < InputNumSGPRs)
440 Requested = InputNumSGPRs;
442 // Make sure requested value is compatible with values implied by
443 // default/requested minimum/maximum number of waves per execution unit.
444 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
446 if (WavesPerEU.second &&
447 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
// A validated non-zero request overrides the computed maximum.
451 MaxNumSGPRs = Requested;
// Hardware with the SGPR-init bug must use a fixed SGPR count.
454 if (hasSGPRInitBug())
455 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
457 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
458 MaxAddressableNumSGPRs);
// Computes the maximum number of VGPRs available to MF, mirroring
// getMaxNumSGPRs: waves-per-EU-implied limit, optional "amdgpu-num-vgpr"
// attribute, then subtract the reserved VGPRs.
// NOTE(review): the bodies of the clamp `if`s and the function's closing
// brace are on lines elided from this listing.
461 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
462 const Function &F = *MF.getFunction();
463 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
465 // Compute maximum number of VGPRs function can use using default/requested
466 // minimum number of waves per execution unit.
467 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
468 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
470 // Check if maximum number of VGPRs was explicitly requested using
471 // "amdgpu-num-vgpr" attribute.
472 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
473 unsigned Requested = AMDGPU::getIntegerAttribute(
474 F, "amdgpu-num-vgpr", MaxNumVGPRs);
476 // Make sure requested value does not violate subtarget's specifications.
477 if (Requested && Requested <= getReservedNumVGPRs(MF))
480 // Make sure requested value is compatible with values implied by
481 // default/requested minimum/maximum number of waves per execution unit.
482 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
484 if (WavesPerEU.second &&
485 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
// A validated non-zero request overrides the computed maximum.
489 MaxNumVGPRs = Requested;
492 return MaxNumVGPRs - getReservedNumVGPRs(MF);