contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

   1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "AMDGPUSubtarget.h"
  16 #include "llvm/ADT/SmallString.h"
  17 #include "llvm/CodeGen/MachineScheduler.h"
  18 #include "llvm/Target/TargetFrameLowering.h"
  19 #include <algorithm>
  20
  21 using namespace llvm;
  22
  23 #define DEBUG_TYPE "amdgpu-subtarget"
  24
  25 #define GET_SUBTARGETINFO_ENUM
  26 #define GET_SUBTARGETINFO_TARGET_DESC
  27 #define GET_SUBTARGETINFO_CTOR
  28 #include "AMDGPUGenSubtargetInfo.inc"
  29
/// Destructor, defined out of line here and defaulted (no custom teardown).
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
  31
  32 AMDGPUSubtarget &
  33 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
  34                                                  StringRef GPU, StringRef FS) {
  35   // Determine default and user-specified characteristics
  36   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  37   // enabled, but some instructions do not respect them and they run at the
  38   // double precision rate, so don't enable by default.
  39   //
  40   // We want to be able to turn these off, but making this a subtarget feature
  41   // for SI has the unhelpful behavior that it unsets everything else if you
  42   // disable it.
  43
  44   SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
  45   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
  46     FullFS += "+flat-for-global,+unaligned-buffer-access,";
  47   FullFS += FS;
  48
  49   ParseSubtargetFeatures(GPU, FullFS);
  50
  51   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  52   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  53   // variants of MUBUF instructions.
  54   if (!hasAddr64() && !FS.contains("flat-for-global")) {
  55     FlatForGlobal = true;
  56   }
  57
  58   // FIXME: I don't think think Evergreen has any useful support for
  59   // denormals, but should be checked. Should we issue a warning somewhere
  60   // if someone tries to enable these?
  61   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
  62     FP16Denormals = false;
  63     FP32Denormals = false;
  64     FP64Denormals = false;
  65   }
  66
  67   // Set defaults if needed.
  68   if (MaxPrivateElementSize == 0)
  69     MaxPrivateElementSize = 4;
  70
  71   return *this;
  72 }
  73
  74 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
  75                                  const TargetMachine &TM)
  76   : AMDGPUGenSubtargetInfo(TT, GPU, FS),
  77     TargetTriple(TT),
  78     Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
  79     IsaVersion(ISAVersion0_0_0),
  80     WavefrontSize(64),
  81     LocalMemorySize(0),
  82     LDSBankCount(0),
  83     MaxPrivateElementSize(0),
  84
  85     FastFMAF32(false),
  86     HalfRate64Ops(false),
  87
  88     FP16Denormals(false),
  89     FP32Denormals(false),
  90     FP64Denormals(false),
  91     FPExceptions(false),
  92     FlatForGlobal(false),
  93     UnalignedScratchAccess(false),
  94     UnalignedBufferAccess(false),
  95
  96     EnableXNACK(false),
  97     DebuggerInsertNops(false),
  98     DebuggerReserveRegs(false),
  99     DebuggerEmitPrologue(false),
 100
 101     EnableVGPRSpilling(false),
 102     EnablePromoteAlloca(false),
 103     EnableLoadStoreOpt(false),
 104     EnableUnsafeDSOffsetFolding(false),
 105     EnableSIScheduler(false),
 106     DumpCode(false),
 107
 108     FP64(false),
 109     IsGCN(false),
 110     GCN1Encoding(false),
 111     GCN3Encoding(false),
 112     CIInsts(false),
 113     SGPRInitBug(false),
 114     HasSMemRealTime(false),
 115     Has16BitInsts(false),
 116     HasMovrel(false),
 117     HasVGPRIndexMode(false),
 118     HasScalarStores(false),
 119     HasInv2PiInlineImm(false),
 120     FlatAddressSpace(false),
 121
 122     R600ALUInst(false),
 123     CaymanISA(false),
 124     CFALUBug(false),
 125     HasVertexCache(false),
 126     TexVTXClauseSize(0),
 127     ScalarizeGlobal(false),
 128
 129     FeatureDisable(false),
 130     InstrItins(getInstrItineraryForCPU(GPU)) {
 131   initializeSubtargetDependencies(TT, GPU, FS);
 132 }
 133
 134 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
 135 // size?
 136 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
 137   switch (NWaves) {
 138   case 10:
 139     return 1638;
 140   case 9:
 141     return 1820;
 142   case 8:
 143     return 2048;
 144   case 7:
 145     return 2340;
 146   case 6:
 147     return 2730;
 148   case 5:
 149     return 3276;
 150   case 4:
 151     return 4096;
 152   case 3:
 153     return 5461;
 154   case 2:
 155     return 8192;
 156   default:
 157     return getLocalMemorySize();
 158   }
 159 }
 160
 161 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
 162   if (Bytes <= 1638)
 163     return 10;
 164
 165   if (Bytes <= 1820)
 166     return 9;
 167
 168   if (Bytes <= 2048)
 169     return 8;
 170
 171   if (Bytes <= 2340)
 172     return 7;
 173
 174   if (Bytes <= 2730)
 175     return 6;
 176
 177   if (Bytes <= 3276)
 178     return 5;
 179
 180   if (Bytes <= 4096)
 181     return 4;
 182
 183   if (Bytes <= 5461)
 184     return 3;
 185
 186   if (Bytes <= 8192)
 187     return 2;
 188
 189   return 1;
 190 }
 191
 192 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
 193   const Function &F) const {
 194   // Default minimum/maximum flat work group sizes.
 195   std::pair<unsigned, unsigned> Default =
 196     AMDGPU::isCompute(F.getCallingConv()) ?
 197       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
 198                                     getWavefrontSize() * 4) :
 199       std::pair<unsigned, unsigned>(1, getWavefrontSize());
 200
 201   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
 202   // starts using "amdgpu-flat-work-group-size" attribute.
 203   Default.second = AMDGPU::getIntegerAttribute(
 204     F, "amdgpu-max-work-group-size", Default.second);
 205   Default.first = std::min(Default.first, Default.second);
 206
 207   // Requested minimum/maximum flat work group sizes.
 208   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
 209     F, "amdgpu-flat-work-group-size", Default);
 210
 211   // Make sure requested minimum is less than requested maximum.
 212   if (Requested.first > Requested.second)
 213     return Default;
 214
 215   // Make sure requested values do not violate subtarget's specifications.
 216   if (Requested.first < getMinFlatWorkGroupSize())
 217     return Default;
 218   if (Requested.second > getMaxFlatWorkGroupSize())
 219     return Default;
 220
 221   return Requested;
 222 }
 223
 224 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
 225   const Function &F) const {
 226   // Default minimum/maximum number of waves per execution unit.
 227   std::pair<unsigned, unsigned> Default(1, 0);
 228
 229   // Default/requested minimum/maximum flat work group sizes.
 230   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
 231
 232   // If minimum/maximum flat work group sizes were explicitly requested using
 233   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
 234   // number of waves per execution unit to values implied by requested
 235   // minimum/maximum flat work group sizes.
 236   unsigned MinImpliedByFlatWorkGroupSize =
 237     getMaxWavesPerEU(FlatWorkGroupSizes.second);
 238   bool RequestedFlatWorkGroupSize = false;
 239
 240   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
 241   // starts using "amdgpu-flat-work-group-size" attribute.
 242   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
 243       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
 244     Default.first = MinImpliedByFlatWorkGroupSize;
 245     RequestedFlatWorkGroupSize = true;
 246   }
 247
 248   // Requested minimum/maximum number of waves per execution unit.
 249   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
 250     F, "amdgpu-waves-per-eu", Default, true);
 251
 252   // Make sure requested minimum is less than requested maximum.
 253   if (Requested.second && Requested.first > Requested.second)
 254     return Default;
 255
 256   // Make sure requested values do not violate subtarget's specifications.
 257   if (Requested.first < getMinWavesPerEU() ||
 258       Requested.first > getMaxWavesPerEU())
 259     return Default;
 260   if (Requested.second > getMaxWavesPerEU())
 261     return Default;
 262
 263   // Make sure requested values are compatible with values implied by requested
 264   // minimum/maximum flat work group sizes.
 265   if (RequestedFlatWorkGroupSize &&
 266       Requested.first > MinImpliedByFlatWorkGroupSize)
 267     return Default;
 268
 269   return Requested;
 270 }
 271
 272 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
 273                              const TargetMachine &TM) :
 274   AMDGPUSubtarget(TT, GPU, FS, TM),
 275   InstrInfo(*this),
 276   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
 277   TLInfo(TM, *this) {}
 278
 279 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 280                          const TargetMachine &TM) :
 281   AMDGPUSubtarget(TT, GPU, FS, TM),
 282   InstrInfo(*this),
 283   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
 284   TLInfo(TM, *this) {}
 285
 286 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
 287                                       unsigned NumRegionInstrs) const {
 288   // Track register pressure so the scheduler can try to decrease
 289   // pressure once register usage is above the threshold defined by
 290   // SIRegisterInfo::getRegPressureSetLimit()
 291   Policy.ShouldTrackPressure = true;
 292
 293   // Enabling both top down and bottom up scheduling seems to give us less
 294   // register spills than just using one of these approaches on its own.
 295   Policy.OnlyTopDown = false;
 296   Policy.OnlyBottomUp = false;
 297
 298   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
 299   if (!enableSIScheduler())
 300     Policy.ShouldTrackLaneMasks = true;
 301 }
 302
 303 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
 304   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 305 }
 306
 307 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
 308                                             unsigned ExplicitArgBytes) const {
 309   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
 310   if (ImplicitBytes == 0)
 311     return ExplicitArgBytes;
 312
 313   unsigned Alignment = getAlignmentForImplicitArgPtr();
 314   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
 315 }
 316
 317 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
 318   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
 319     if (SGPRs <= 80)
 320       return 10;
 321     if (SGPRs <= 88)
 322       return 9;
 323     if (SGPRs <= 100)
 324       return 8;
 325     return 7;
 326   }
 327   if (SGPRs <= 48)
 328     return 10;
 329   if (SGPRs <= 56)
 330     return 9;
 331   if (SGPRs <= 64)
 332     return 8;
 333   if (SGPRs <= 72)
 334     return 7;
 335   if (SGPRs <= 80)
 336     return 6;
 337   return 5;
 338 }
 339
 340 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
 341   if (VGPRs <= 24)
 342     return 10;
 343   if (VGPRs <= 28)
 344     return 9;
 345   if (VGPRs <= 32)
 346     return 8;
 347   if (VGPRs <= 36)
 348     return 7;
 349   if (VGPRs <= 40)
 350     return 6;
 351   if (VGPRs <= 48)
 352     return 5;
 353   if (VGPRs <= 64)
 354     return 4;
 355   if (VGPRs <= 84)
 356     return 3;
 357   if (VGPRs <= 128)
 358     return 2;
 359   return 1;
 360 }
 361
 362 unsigned SISubtarget::getMaxNumSGPRs() const {
 363   if (hasSGPRInitBug())
 364     return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
 365
 366   if (getGeneration() >= VOLCANIC_ISLANDS)
 367     return 102;
 368
 369   return 104;
 370 }