//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>
#include <climits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
31 AMDGPUSubtarget::~AMDGPUSubtarget() = default;
34 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
35 StringRef GPU, StringRef FS) {
36 // Determine default and user-specified characteristics
37 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
38 // enabled, but some instructions do not respect them and they run at the
39 // double precision rate, so don't enable by default.
41 // We want to be able to turn these off, but making this a subtarget feature
42 // for SI has the unhelpful behavior that it unsets everything else if you
45 SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
46 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
47 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
51 ParseSubtargetFeatures(GPU, FullFS);
53 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
54 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
55 // variants of MUBUF instructions.
56 if (!hasAddr64() && !FS.contains("flat-for-global")) {
60 // FIXME: I don't think think Evergreen has any useful support for
61 // denormals, but should be checked. Should we issue a warning somewhere
62 // if someone tries to enable these?
63 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
64 FP64FP16Denormals = false;
65 FP32Denormals = false;
68 // Set defaults if needed.
69 if (MaxPrivateElementSize == 0)
70 MaxPrivateElementSize = 4;
75 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
76 const TargetMachine &TM)
77 : AMDGPUGenSubtargetInfo(TT, GPU, FS),
79 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
80 IsaVersion(ISAVersion0_0_0),
84 MaxPrivateElementSize(0),
90 FP64FP16Denormals(false),
94 UnalignedScratchAccess(false),
95 UnalignedBufferAccess(false),
97 HasApertureRegs(false),
100 DebuggerInsertNops(false),
101 DebuggerReserveRegs(false),
102 DebuggerEmitPrologue(false),
104 EnableVGPRSpilling(false),
105 EnablePromoteAlloca(false),
106 EnableLoadStoreOpt(false),
107 EnableUnsafeDSOffsetFolding(false),
108 EnableSIScheduler(false),
118 HasSMemRealTime(false),
119 Has16BitInsts(false),
120 HasVOP3PInsts(false),
122 HasVGPRIndexMode(false),
123 HasScalarStores(false),
124 HasInv2PiInlineImm(false),
127 FlatAddressSpace(false),
132 HasVertexCache(false),
134 ScalarizeGlobal(false),
136 FeatureDisable(false),
137 InstrItins(getInstrItineraryForCPU(GPU)) {
138 AS = AMDGPU::getAMDGPUAS(TT);
139 initializeSubtargetDependencies(TT, GPU, FS);
142 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
143 const Function &F) const {
145 return getLocalMemorySize();
146 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
147 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
148 unsigned MaxWaves = getMaxWavesPerEU();
149 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
152 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
153 const Function &F) const {
154 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
155 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
156 unsigned MaxWaves = getMaxWavesPerEU();
157 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
158 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
159 NumWaves = std::min(NumWaves, MaxWaves);
160 NumWaves = std::max(NumWaves, 1u);
164 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
165 const Function &F) const {
166 // Default minimum/maximum flat work group sizes.
167 std::pair<unsigned, unsigned> Default =
168 AMDGPU::isCompute(F.getCallingConv()) ?
169 std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
170 getWavefrontSize() * 4) :
171 std::pair<unsigned, unsigned>(1, getWavefrontSize());
173 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
174 // starts using "amdgpu-flat-work-group-size" attribute.
175 Default.second = AMDGPU::getIntegerAttribute(
176 F, "amdgpu-max-work-group-size", Default.second);
177 Default.first = std::min(Default.first, Default.second);
179 // Requested minimum/maximum flat work group sizes.
180 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
181 F, "amdgpu-flat-work-group-size", Default);
183 // Make sure requested minimum is less than requested maximum.
184 if (Requested.first > Requested.second)
187 // Make sure requested values do not violate subtarget's specifications.
188 if (Requested.first < getMinFlatWorkGroupSize())
190 if (Requested.second > getMaxFlatWorkGroupSize())
196 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
197 const Function &F) const {
198 // Default minimum/maximum number of waves per execution unit.
199 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
201 // Default/requested minimum/maximum flat work group sizes.
202 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
204 // If minimum/maximum flat work group sizes were explicitly requested using
205 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
206 // number of waves per execution unit to values implied by requested
207 // minimum/maximum flat work group sizes.
208 unsigned MinImpliedByFlatWorkGroupSize =
209 getMaxWavesPerEU(FlatWorkGroupSizes.second);
210 bool RequestedFlatWorkGroupSize = false;
212 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
213 // starts using "amdgpu-flat-work-group-size" attribute.
214 if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
215 F.hasFnAttribute("amdgpu-flat-work-group-size")) {
216 Default.first = MinImpliedByFlatWorkGroupSize;
217 RequestedFlatWorkGroupSize = true;
220 // Requested minimum/maximum number of waves per execution unit.
221 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
222 F, "amdgpu-waves-per-eu", Default, true);
224 // Make sure requested minimum is less than requested maximum.
225 if (Requested.second && Requested.first > Requested.second)
228 // Make sure requested values do not violate subtarget's specifications.
229 if (Requested.first < getMinWavesPerEU() ||
230 Requested.first > getMaxWavesPerEU())
232 if (Requested.second > getMaxWavesPerEU())
235 // Make sure requested values are compatible with values implied by requested
236 // minimum/maximum flat work group sizes.
237 if (RequestedFlatWorkGroupSize &&
238 Requested.first > MinImpliedByFlatWorkGroupSize)
244 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
245 Function *Kernel = I->getParent()->getParent();
246 unsigned MinSize = 0;
247 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
248 bool IdQuery = false;
250 // If reqd_work_group_size is present it narrows value down.
251 if (auto *CI = dyn_cast<CallInst>(I)) {
252 const Function *F = CI->getCalledFunction();
254 unsigned Dim = UINT_MAX;
255 switch (F->getIntrinsicID()) {
256 case Intrinsic::amdgcn_workitem_id_x:
257 case Intrinsic::r600_read_tidig_x:
259 case Intrinsic::r600_read_local_size_x:
262 case Intrinsic::amdgcn_workitem_id_y:
263 case Intrinsic::r600_read_tidig_y:
265 case Intrinsic::r600_read_local_size_y:
268 case Intrinsic::amdgcn_workitem_id_z:
269 case Intrinsic::r600_read_tidig_z:
271 case Intrinsic::r600_read_local_size_z:
278 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
279 if (Node->getNumOperands() == 3)
280 MinSize = MaxSize = mdconst::extract<ConstantInt>(
281 Node->getOperand(Dim))->getZExtValue();
289 // Range metadata is [Lo, Hi). For ID query we need to pass max size
290 // as Hi. For size query we need to pass Hi + 1.
296 MDBuilder MDB(I->getContext());
297 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
299 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
303 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
304 const TargetMachine &TM) :
305 AMDGPUSubtarget(TT, GPU, FS, TM),
307 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
310 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
311 const TargetMachine &TM) :
312 AMDGPUSubtarget(TT, GPU, FS, TM),
314 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
317 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
318 unsigned NumRegionInstrs) const {
319 // Track register pressure so the scheduler can try to decrease
320 // pressure once register usage is above the threshold defined by
321 // SIRegisterInfo::getRegPressureSetLimit()
322 Policy.ShouldTrackPressure = true;
324 // Enabling both top down and bottom up scheduling seems to give us less
325 // register spills than just using one of these approaches on its own.
326 Policy.OnlyTopDown = false;
327 Policy.OnlyBottomUp = false;
329 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
330 if (!enableSIScheduler())
331 Policy.ShouldTrackLaneMasks = true;
334 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
335 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
338 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
339 unsigned ExplicitArgBytes) const {
340 unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
341 if (ImplicitBytes == 0)
342 return ExplicitArgBytes;
344 unsigned Alignment = getAlignmentForImplicitArgPtr();
345 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
348 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
349 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
371 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
393 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
394 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
395 if (MFI.hasFlatScratchInit()) {
396 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
397 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
398 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
399 return 4; // FLAT_SCRATCH, VCC (in that order).
402 if (isXNACKEnabled())
403 return 4; // XNACK, VCC (in that order).
407 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
408 const Function &F = *MF.getFunction();
409 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
411 // Compute maximum number of SGPRs function can use using default/requested
412 // minimum number of waves per execution unit.
413 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
414 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
415 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
417 // Check if maximum number of SGPRs was explicitly requested using
418 // "amdgpu-num-sgpr" attribute.
419 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
420 unsigned Requested = AMDGPU::getIntegerAttribute(
421 F, "amdgpu-num-sgpr", MaxNumSGPRs);
423 // Make sure requested value does not violate subtarget's specifications.
424 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
427 // If more SGPRs are required to support the input user/system SGPRs,
428 // increase to accommodate them.
430 // FIXME: This really ends up using the requested number of SGPRs + number
431 // of reserved special registers in total. Theoretically you could re-use
432 // the last input registers for these special registers, but this would
433 // require a lot of complexity to deal with the weird aliasing.
434 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
435 if (Requested && Requested < InputNumSGPRs)
436 Requested = InputNumSGPRs;
438 // Make sure requested value is compatible with values implied by
439 // default/requested minimum/maximum number of waves per execution unit.
440 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
442 if (WavesPerEU.second &&
443 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
447 MaxNumSGPRs = Requested;
450 if (hasSGPRInitBug())
451 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
453 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
454 MaxAddressableNumSGPRs);
457 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
458 const Function &F = *MF.getFunction();
459 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
461 // Compute maximum number of VGPRs function can use using default/requested
462 // minimum number of waves per execution unit.
463 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
464 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
466 // Check if maximum number of VGPRs was explicitly requested using
467 // "amdgpu-num-vgpr" attribute.
468 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
469 unsigned Requested = AMDGPU::getIntegerAttribute(
470 F, "amdgpu-num-vgpr", MaxNumVGPRs);
472 // Make sure requested value does not violate subtarget's specifications.
473 if (Requested && Requested <= getReservedNumVGPRs(MF))
476 // Make sure requested value is compatible with values implied by
477 // default/requested minimum/maximum number of waves per execution unit.
478 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
480 if (WavesPerEU.second &&
481 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
485 MaxNumVGPRs = Requested;
488 return MaxNumVGPRs - getReservedNumVGPRs(MF);