contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

   1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //==-----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief AMDGPU specific subclass of TargetSubtarget.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
  16 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
  17
  18 #include "AMDGPU.h"
  19 #include "R600InstrInfo.h"
  20 #include "R600ISelLowering.h"
  21 #include "R600FrameLowering.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIISelLowering.h"
  24 #include "SIFrameLowering.h"
  25 #include "Utils/AMDGPUBaseInfo.h"
  26 #include "llvm/ADT/Triple.h"
  27 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
  28 #include "llvm/CodeGen/MachineFunction.h"
  29 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
  30 #include "llvm/MC/MCInstrItineraries.h"
  31 #include "llvm/Support/MathExtras.h"
  32 #include <cassert>
  33 #include <cstdint>
  34 #include <memory>
  35 #include <utility>
  36
  37 #define GET_SUBTARGETINFO_HEADER
  38 #include "AMDGPUGenSubtargetInfo.inc"
  39
  40 namespace llvm {
  41
  42 class StringRef;
  43
  44 class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
  45 public:
  46   enum Generation {
  47     R600 = 0,
  48     R700,
  49     EVERGREEN,
  50     NORTHERN_ISLANDS,
  51     SOUTHERN_ISLANDS,
  52     SEA_ISLANDS,
  53     VOLCANIC_ISLANDS,
  54   };
  55
  56   enum {
  57     ISAVersion0_0_0,
  58     ISAVersion7_0_0,
  59     ISAVersion7_0_1,
  60     ISAVersion7_0_2,
  61     ISAVersion8_0_0,
  62     ISAVersion8_0_1,
  63     ISAVersion8_0_2,
  64     ISAVersion8_0_3,
  65     ISAVersion8_0_4,
  66     ISAVersion8_1_0,
  67   };
  68
  69 protected:
  70   // Basic subtarget description.
  71   Triple TargetTriple;
  72   Generation Gen;
  73   unsigned IsaVersion;
  74   unsigned WavefrontSize;
  75   int LocalMemorySize;
  76   int LDSBankCount;
  77   unsigned MaxPrivateElementSize;
  78
  79   // Possibly statically set by tablegen, but may want to be overridden.
  80   bool FastFMAF32;
  81   bool HalfRate64Ops;
  82
  83   // Dynamially set bits that enable features.
  84   bool FP16Denormals;
  85   bool FP32Denormals;
  86   bool FP64Denormals;
  87   bool FPExceptions;
  88   bool FlatForGlobal;
  89   bool UnalignedScratchAccess;
  90   bool UnalignedBufferAccess;
  91   bool EnableXNACK;
  92   bool DebuggerInsertNops;
  93   bool DebuggerReserveRegs;
  94   bool DebuggerEmitPrologue;
  95
  96   // Used as options.
  97   bool EnableVGPRSpilling;
  98   bool EnablePromoteAlloca;
  99   bool EnableLoadStoreOpt;
 100   bool EnableUnsafeDSOffsetFolding;
 101   bool EnableSIScheduler;
 102   bool DumpCode;
 103
 104   // Subtarget statically properties set by tablegen
 105   bool FP64;
 106   bool IsGCN;
 107   bool GCN1Encoding;
 108   bool GCN3Encoding;
 109   bool CIInsts;
 110   bool SGPRInitBug;
 111   bool HasSMemRealTime;
 112   bool Has16BitInsts;
 113   bool HasMovrel;
 114   bool HasVGPRIndexMode;
 115   bool HasScalarStores;
 116   bool HasInv2PiInlineImm;
 117   bool FlatAddressSpace;
 118   bool R600ALUInst;
 119   bool CaymanISA;
 120   bool CFALUBug;
 121   bool HasVertexCache;
 122   short TexVTXClauseSize;
 123   bool ScalarizeGlobal;
 124
 125   // Dummy feature to use for assembler in tablegen.
 126   bool FeatureDisable;
 127
 128   InstrItineraryData InstrItins;
 129   SelectionDAGTargetInfo TSInfo;
 130
 131 public:
 132   AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 133                   const TargetMachine &TM);
 134   ~AMDGPUSubtarget() override;
 135
 136   AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
 137                                                    StringRef GPU, StringRef FS);
 138
 139   const AMDGPUInstrInfo *getInstrInfo() const override = 0;
 140   const AMDGPUFrameLowering *getFrameLowering() const override = 0;
 141   const AMDGPUTargetLowering *getTargetLowering() const override = 0;
 142   const AMDGPURegisterInfo *getRegisterInfo() const override = 0;
 143
 144   const InstrItineraryData *getInstrItineraryData() const override {
 145     return &InstrItins;
 146   }
 147
 148   // Nothing implemented, just prevent crashes on use.
 149   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
 150     return &TSInfo;
 151   }
 152
 153   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 154
 155   bool isAmdHsaOS() const {
 156     return TargetTriple.getOS() == Triple::AMDHSA;
 157   }
 158
 159   bool isMesa3DOS() const {
 160     return TargetTriple.getOS() == Triple::Mesa3D;
 161   }
 162
 163   bool isOpenCLEnv() const {
 164     return TargetTriple.getEnvironment() == Triple::OpenCL;
 165   }
 166
 167   Generation getGeneration() const {
 168     return Gen;
 169   }
 170
 171   unsigned getWavefrontSize() const {
 172     return WavefrontSize;
 173   }
 174
 175   int getLocalMemorySize() const {
 176     return LocalMemorySize;
 177   }
 178
 179   int getLDSBankCount() const {
 180     return LDSBankCount;
 181   }
 182
 183   unsigned getMaxPrivateElementSize() const {
 184     return MaxPrivateElementSize;
 185   }
 186
 187   bool has16BitInsts() const {
 188     return Has16BitInsts;
 189   }
 190
 191   bool hasHWFP64() const {
 192     return FP64;
 193   }
 194
 195   bool hasFastFMAF32() const {
 196     return FastFMAF32;
 197   }
 198
 199   bool hasHalfRate64Ops() const {
 200     return HalfRate64Ops;
 201   }
 202
 203   bool hasAddr64() const {
 204     return (getGeneration() < VOLCANIC_ISLANDS);
 205   }
 206
 207   bool hasBFE() const {
 208     return (getGeneration() >= EVERGREEN);
 209   }
 210
 211   bool hasBFI() const {
 212     return (getGeneration() >= EVERGREEN);
 213   }
 214
 215   bool hasBFM() const {
 216     return hasBFE();
 217   }
 218
 219   bool hasBCNT(unsigned Size) const {
 220     if (Size == 32)
 221       return (getGeneration() >= EVERGREEN);
 222
 223     if (Size == 64)
 224       return (getGeneration() >= SOUTHERN_ISLANDS);
 225
 226     return false;
 227   }
 228
 229   bool hasMulU24() const {
 230     return (getGeneration() >= EVERGREEN);
 231   }
 232
 233   bool hasMulI24() const {
 234     return (getGeneration() >= SOUTHERN_ISLANDS ||
 235             hasCaymanISA());
 236   }
 237
 238   bool hasFFBL() const {
 239     return (getGeneration() >= EVERGREEN);
 240   }
 241
 242   bool hasFFBH() const {
 243     return (getGeneration() >= EVERGREEN);
 244   }
 245
 246   bool hasCARRY() const {
 247     return (getGeneration() >= EVERGREEN);
 248   }
 249
 250   bool hasBORROW() const {
 251     return (getGeneration() >= EVERGREEN);
 252   }
 253
 254   bool hasCaymanISA() const {
 255     return CaymanISA;
 256   }
 257
 258   bool isPromoteAllocaEnabled() const {
 259     return EnablePromoteAlloca;
 260   }
 261
 262   bool unsafeDSOffsetFoldingEnabled() const {
 263     return EnableUnsafeDSOffsetFolding;
 264   }
 265
 266   bool dumpCode() const {
 267     return DumpCode;
 268   }
 269
 270   bool enableIEEEBit(const MachineFunction &MF) const {
 271     return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
 272   }
 273
 274   /// Return the amount of LDS that can be used that will not restrict the
 275   /// occupancy lower than WaveCount.
 276   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
 277
 278   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
 279   /// the given LDS memory size is the only constraint.
 280   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
 281
 282   bool hasFP16Denormals() const {
 283     return FP16Denormals;
 284   }
 285
 286   bool hasFP32Denormals() const {
 287     return FP32Denormals;
 288   }
 289
 290   bool hasFP64Denormals() const {
 291     return FP64Denormals;
 292   }
 293
 294   bool hasFPExceptions() const {
 295     return FPExceptions;
 296   }
 297
 298   bool useFlatForGlobal() const {
 299     return FlatForGlobal;
 300   }
 301
 302   bool hasUnalignedBufferAccess() const {
 303     return UnalignedBufferAccess;
 304   }
 305
 306   bool hasUnalignedScratchAccess() const {
 307     return UnalignedScratchAccess;
 308   }
 309
 310   bool isXNACKEnabled() const {
 311     return EnableXNACK;
 312   }
 313
 314   bool isAmdCodeObjectV2() const {
 315     return isAmdHsaOS() || isMesa3DOS();
 316   }
 317
 318   /// \brief Returns the offset in bytes from the start of the input buffer
 319   ///        of the first explicit kernel argument.
 320   unsigned getExplicitKernelArgOffset() const {
 321     return isAmdCodeObjectV2() ? 0 : 36;
 322   }
 323
 324   unsigned getAlignmentForImplicitArgPtr() const {
 325     return isAmdHsaOS() ? 8 : 4;
 326   }
 327
 328   unsigned getImplicitArgNumBytes() const {
 329     if (isMesa3DOS())
 330       return 16;
 331     if (isAmdHsaOS() && isOpenCLEnv())
 332       return 32;
 333     return 0;
 334   }
 335
 336   unsigned getStackAlignment() const {
 337     // Scratch is allocated in 256 dword per wave blocks.
 338     return 4 * 256 / getWavefrontSize();
 339   }
 340
 341   bool enableMachineScheduler() const override {
 342     return true;
 343   }
 344
 345   bool enableSubRegLiveness() const override {
 346     return true;
 347   }
 348
 349   /// \returns Number of execution units per compute unit supported by the
 350   /// subtarget.
 351   unsigned getEUsPerCU() const {
 352     return 4;
 353   }
 354
 355   /// \returns Maximum number of work groups per compute unit supported by the
 356   /// subtarget and limited by given flat work group size.
 357   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
 358     if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
 359       return 8;
 360     return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
 361   }
 362
 363   /// \returns Maximum number of waves per compute unit supported by the
 364   /// subtarget without any kind of limitation.
 365   unsigned getMaxWavesPerCU() const {
 366     return getMaxWavesPerEU() * getEUsPerCU();
 367   }
 368
 369   /// \returns Maximum number of waves per compute unit supported by the
 370   /// subtarget and limited by given flat work group size.
 371   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
 372     return getWavesPerWorkGroup(FlatWorkGroupSize);
 373   }
 374
 375   /// \returns Minimum number of waves per execution unit supported by the
 376   /// subtarget.
 377   unsigned getMinWavesPerEU() const {
 378     return 1;
 379   }
 380
 381   /// \returns Maximum number of waves per execution unit supported by the
 382   /// subtarget without any kind of limitation.
 383   unsigned getMaxWavesPerEU() const {
 384     if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
 385       return 8;
 386     // FIXME: Need to take scratch memory into account.
 387     return 10;
 388   }
 389
 390   /// \returns Maximum number of waves per execution unit supported by the
 391   /// subtarget and limited by given flat work group size.
 392   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
 393     return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
 394       getEUsPerCU();
 395   }
 396
 397   /// \returns Minimum flat work group size supported by the subtarget.
 398   unsigned getMinFlatWorkGroupSize() const {
 399     return 1;
 400   }
 401
 402   /// \returns Maximum flat work group size supported by the subtarget.
 403   unsigned getMaxFlatWorkGroupSize() const {
 404     return 2048;
 405   }
 406
 407   /// \returns Number of waves per work group given the flat work group size.
 408   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
 409     return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
 410   }
 411
 412   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
 413   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
 414
 415   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
 416   /// for function \p F, or minimum/maximum flat work group sizes explicitly
 417   /// requested using "amdgpu-flat-work-group-size" attribute attached to
 418   /// function \p F.
 419   ///
 420   /// \returns Subtarget's default values if explicitly requested values cannot
 421   /// be converted to integer, or violate subtarget's specifications.
 422   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
 423
 424   /// \returns Subtarget's default pair of minimum/maximum number of waves per
 425   /// execution unit for function \p F, or minimum/maximum number of waves per
 426   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
 427   /// attached to function \p F.
 428   ///
 429   /// \returns Subtarget's default values if explicitly requested values cannot
 430   /// be converted to integer, violate subtarget's specifications, or are not
 431   /// compatible with minimum/maximum number of waves limited by flat work group
 432   /// size, register usage, and/or lds usage.
 433   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
 434 };
 435
 436 class R600Subtarget final : public AMDGPUSubtarget {
 437 private:
 438   R600InstrInfo InstrInfo;
 439   R600FrameLowering FrameLowering;
 440   R600TargetLowering TLInfo;
 441
 442 public:
 443   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
 444                 const TargetMachine &TM);
 445
 446   const R600InstrInfo *getInstrInfo() const override {
 447     return &InstrInfo;
 448   }
 449
 450   const R600FrameLowering *getFrameLowering() const override {
 451     return &FrameLowering;
 452   }
 453
 454   const R600TargetLowering *getTargetLowering() const override {
 455     return &TLInfo;
 456   }
 457
 458   const R600RegisterInfo *getRegisterInfo() const override {
 459     return &InstrInfo.getRegisterInfo();
 460   }
 461
 462   bool hasCFAluBug() const {
 463     return CFALUBug;
 464   }
 465
 466   bool hasVertexCache() const {
 467     return HasVertexCache;
 468   }
 469
 470   short getTexVTXClauseSize() const {
 471     return TexVTXClauseSize;
 472   }
 473 };
 474
 475 class SISubtarget final : public AMDGPUSubtarget {
 476 public:
 477   enum {
 478     // The closed Vulkan driver sets 96, which limits the wave count to 8 but
 479     // doesn't spill SGPRs as much as when 80 is set.
 480     FIXED_SGPR_COUNT_FOR_INIT_BUG = 96
 481   };
 482
 483 private:
 484   SIInstrInfo InstrInfo;
 485   SIFrameLowering FrameLowering;
 486   SITargetLowering TLInfo;
 487   std::unique_ptr<GISelAccessor> GISel;
 488
 489 public:
 490   SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
 491               const TargetMachine &TM);
 492
 493   const SIInstrInfo *getInstrInfo() const override {
 494     return &InstrInfo;
 495   }
 496
 497   const SIFrameLowering *getFrameLowering() const override {
 498     return &FrameLowering;
 499   }
 500
 501   const SITargetLowering *getTargetLowering() const override {
 502     return &TLInfo;
 503   }
 504
 505   const CallLowering *getCallLowering() const override {
 506     assert(GISel && "Access to GlobalISel APIs not set");
 507     return GISel->getCallLowering();
 508   }
 509
 510   const SIRegisterInfo *getRegisterInfo() const override {
 511     return &InstrInfo.getRegisterInfo();
 512   }
 513
 514   void setGISelAccessor(GISelAccessor &GISel) {
 515     this->GISel.reset(&GISel);
 516   }
 517
 518   void overrideSchedPolicy(MachineSchedPolicy &Policy,
 519                            unsigned NumRegionInstrs) const override;
 520
 521   bool isVGPRSpillingEnabled(const Function& F) const;
 522
 523   unsigned getMaxNumUserSGPRs() const {
 524     return 16;
 525   }
 526
 527   bool hasFlatAddressSpace() const {
 528     return FlatAddressSpace;
 529   }
 530
 531   bool hasSMemRealTime() const {
 532     return HasSMemRealTime;
 533   }
 534
 535   bool hasMovrel() const {
 536     return HasMovrel;
 537   }
 538
 539   bool hasVGPRIndexMode() const {
 540     return HasVGPRIndexMode;
 541   }
 542
 543   bool hasScalarCompareEq64() const {
 544     return getGeneration() >= VOLCANIC_ISLANDS;
 545   }
 546
 547   bool hasScalarStores() const {
 548     return HasScalarStores;
 549   }
 550
 551   bool hasInv2PiInlineImm() const {
 552     return HasInv2PiInlineImm;
 553   }
 554
 555   bool enableSIScheduler() const {
 556     return EnableSIScheduler;
 557   }
 558
 559   bool debuggerSupported() const {
 560     return debuggerInsertNops() && debuggerReserveRegs() &&
 561       debuggerEmitPrologue();
 562   }
 563
 564   bool debuggerInsertNops() const {
 565     return DebuggerInsertNops;
 566   }
 567
 568   bool debuggerReserveRegs() const {
 569     return DebuggerReserveRegs;
 570   }
 571
 572   bool debuggerEmitPrologue() const {
 573     return DebuggerEmitPrologue;
 574   }
 575
 576   bool loadStoreOptEnabled() const {
 577     return EnableLoadStoreOpt;
 578   }
 579
 580   bool hasSGPRInitBug() const {
 581     return SGPRInitBug;
 582   }
 583
 584   bool has12DWordStoreHazard() const {
 585     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
 586   }
 587
 588   unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
 589
 590   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
 591   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
 592
 593   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
 594   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 595
 596   /// \returns True if waitcnt instruction is needed before barrier instruction,
 597   /// false otherwise.
 598   bool needWaitcntBeforeBarrier() const {
 599     return true;
 600   }
 601
 602   unsigned getMaxNumSGPRs() const;
 603 };
 604
 605 } // end namespace llvm
 606
 607 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H