contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

   1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //==-----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief AMDGPU specific subclass of TargetSubtarget.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
  16 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
  17
  18 #include "AMDGPU.h"
  19 #include "R600InstrInfo.h"
  20 #include "R600ISelLowering.h"
  21 #include "R600FrameLowering.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIISelLowering.h"
  24 #include "SIFrameLowering.h"
  25 #include "Utils/AMDGPUBaseInfo.h"
  26 #include "llvm/ADT/Triple.h"
  27 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
  28 #include "llvm/CodeGen/MachineFunction.h"
  29 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
  30 #include "llvm/MC/MCInstrItineraries.h"
  31 #include "llvm/Support/MathExtras.h"
  32 #include <cassert>
  33 #include <cstdint>
  34 #include <memory>
  35 #include <utility>
  36
  37 #define GET_SUBTARGETINFO_HEADER
  38 #include "AMDGPUGenSubtargetInfo.inc"
  39
  40 namespace llvm {
  41
  42 class StringRef;
  43
  44 class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
  45 public:
  46   enum Generation {
  47     R600 = 0,
  48     R700,
  49     EVERGREEN,
  50     NORTHERN_ISLANDS,
  51     SOUTHERN_ISLANDS,
  52     SEA_ISLANDS,
  53     VOLCANIC_ISLANDS,
  54   };
  55
  56   enum {
  57     ISAVersion0_0_0,
  58     ISAVersion7_0_0,
  59     ISAVersion7_0_1,
  60     ISAVersion7_0_2,
  61     ISAVersion8_0_0,
  62     ISAVersion8_0_1,
  63     ISAVersion8_0_2,
  64     ISAVersion8_0_3,
  65     ISAVersion8_0_4,
  66     ISAVersion8_1_0,
  67   };
  68
  69 protected:
  70   // Basic subtarget description.
  71   Triple TargetTriple;
  72   Generation Gen;
  73   unsigned IsaVersion;
  74   unsigned WavefrontSize;
  75   int LocalMemorySize;
  76   int LDSBankCount;
  77   unsigned MaxPrivateElementSize;
  78
  79   // Possibly statically set by tablegen, but may want to be overridden.
  80   bool FastFMAF32;
  81   bool HalfRate64Ops;
  82
  83   // Dynamially set bits that enable features.
  84   bool FP16Denormals;
  85   bool FP32Denormals;
  86   bool FP64Denormals;
  87   bool FPExceptions;
  88   bool FlatForGlobal;
  89   bool UnalignedScratchAccess;
  90   bool UnalignedBufferAccess;
  91   bool EnableXNACK;
  92   bool DebuggerInsertNops;
  93   bool DebuggerReserveRegs;
  94   bool DebuggerEmitPrologue;
  95
  96   // Used as options.
  97   bool EnableVGPRSpilling;
  98   bool EnablePromoteAlloca;
  99   bool EnableLoadStoreOpt;
 100   bool EnableUnsafeDSOffsetFolding;
 101   bool EnableSIScheduler;
 102   bool DumpCode;
 103
 104   // Subtarget statically properties set by tablegen
 105   bool FP64;
 106   bool IsGCN;
 107   bool GCN1Encoding;
 108   bool GCN3Encoding;
 109   bool CIInsts;
 110   bool SGPRInitBug;
 111   bool HasSMemRealTime;
 112   bool Has16BitInsts;
 113   bool HasMovrel;
 114   bool HasVGPRIndexMode;
 115   bool HasScalarStores;
 116   bool HasInv2PiInlineImm;
 117   bool FlatAddressSpace;
 118   bool R600ALUInst;
 119   bool CaymanISA;
 120   bool CFALUBug;
 121   bool HasVertexCache;
 122   short TexVTXClauseSize;
 123   bool ScalarizeGlobal;
 124
 125   // Dummy feature to use for assembler in tablegen.
 126   bool FeatureDisable;
 127
 128   InstrItineraryData InstrItins;
 129   SelectionDAGTargetInfo TSInfo;
 130
 131 public:
 132   AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 133                   const TargetMachine &TM);
 134   ~AMDGPUSubtarget() override;
 135
 136   AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
 137                                                    StringRef GPU, StringRef FS);
 138
 139   const AMDGPUInstrInfo *getInstrInfo() const override = 0;
 140   const AMDGPUFrameLowering *getFrameLowering() const override = 0;
 141   const AMDGPUTargetLowering *getTargetLowering() const override = 0;
 142   const AMDGPURegisterInfo *getRegisterInfo() const override = 0;
 143
 144   const InstrItineraryData *getInstrItineraryData() const override {
 145     return &InstrItins;
 146   }
 147
 148   // Nothing implemented, just prevent crashes on use.
 149   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
 150     return &TSInfo;
 151   }
 152
 153   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 154
 155   bool isAmdHsaOS() const {
 156     return TargetTriple.getOS() == Triple::AMDHSA;
 157   }
 158
 159   bool isMesa3DOS() const {
 160     return TargetTriple.getOS() == Triple::Mesa3D;
 161   }
 162
 163   bool isOpenCLEnv() const {
 164     return TargetTriple.getEnvironment() == Triple::OpenCL;
 165   }
 166
 167   Generation getGeneration() const {
 168     return Gen;
 169   }
 170
 171   unsigned getWavefrontSize() const {
 172     return WavefrontSize;
 173   }
 174
 175   int getLocalMemorySize() const {
 176     return LocalMemorySize;
 177   }
 178
 179   int getLDSBankCount() const {
 180     return LDSBankCount;
 181   }
 182
 183   unsigned getMaxPrivateElementSize() const {
 184     return MaxPrivateElementSize;
 185   }
 186
 187   bool has16BitInsts() const {
 188     return Has16BitInsts;
 189   }
 190
 191   bool hasHWFP64() const {
 192     return FP64;
 193   }
 194
 195   bool hasFastFMAF32() const {
 196     return FastFMAF32;
 197   }
 198
 199   bool hasHalfRate64Ops() const {
 200     return HalfRate64Ops;
 201   }
 202
 203   bool hasAddr64() const {
 204     return (getGeneration() < VOLCANIC_ISLANDS);
 205   }
 206
 207   bool hasBFE() const {
 208     return (getGeneration() >= EVERGREEN);
 209   }
 210
 211   bool hasBFI() const {
 212     return (getGeneration() >= EVERGREEN);
 213   }
 214
 215   bool hasBFM() const {
 216     return hasBFE();
 217   }
 218
 219   bool hasBCNT(unsigned Size) const {
 220     if (Size == 32)
 221       return (getGeneration() >= EVERGREEN);
 222
 223     if (Size == 64)
 224       return (getGeneration() >= SOUTHERN_ISLANDS);
 225
 226     return false;
 227   }
 228
 229   bool hasMulU24() const {
 230     return (getGeneration() >= EVERGREEN);
 231   }
 232
 233   bool hasMulI24() const {
 234     return (getGeneration() >= SOUTHERN_ISLANDS ||
 235             hasCaymanISA());
 236   }
 237
 238   bool hasFFBL() const {
 239     return (getGeneration() >= EVERGREEN);
 240   }
 241
 242   bool hasFFBH() const {
 243     return (getGeneration() >= EVERGREEN);
 244   }
 245
 246   bool hasCARRY() const {
 247     return (getGeneration() >= EVERGREEN);
 248   }
 249
 250   bool hasBORROW() const {
 251     return (getGeneration() >= EVERGREEN);
 252   }
 253
 254   bool hasCaymanISA() const {
 255     return CaymanISA;
 256   }
 257
 258   bool isPromoteAllocaEnabled() const {
 259     return EnablePromoteAlloca;
 260   }
 261
 262   bool unsafeDSOffsetFoldingEnabled() const {
 263     return EnableUnsafeDSOffsetFolding;
 264   }
 265
 266   bool dumpCode() const {
 267     return DumpCode;
 268   }
 269
 270   bool enableIEEEBit(const MachineFunction &MF) const {
 271     return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
 272   }
 273
 274   /// Return the amount of LDS that can be used that will not restrict the
 275   /// occupancy lower than WaveCount.
 276   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
 277
 278   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
 279   /// the given LDS memory size is the only constraint.
 280   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
 281
 282   bool hasFP16Denormals() const {
 283     return FP16Denormals;
 284   }
 285
 286   bool hasFP32Denormals() const {
 287     return FP32Denormals;
 288   }
 289
 290   bool hasFP64Denormals() const {
 291     return FP64Denormals;
 292   }
 293
 294   bool hasFPExceptions() const {
 295     return FPExceptions;
 296   }
 297
 298   bool useFlatForGlobal() const {
 299     return FlatForGlobal;
 300   }
 301
 302   bool hasUnalignedBufferAccess() const {
 303     return UnalignedBufferAccess;
 304   }
 305
 306   bool hasUnalignedScratchAccess() const {
 307     return UnalignedScratchAccess;
 308   }
 309
 310   bool isXNACKEnabled() const {
 311     return EnableXNACK;
 312   }
 313
 314   bool isMesaKernel(const MachineFunction &MF) const {
 315     return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
 316   }
 317
 318   // Covers VS/PS/CS graphics shaders
 319   bool isMesaGfxShader(const MachineFunction &MF) const {
 320     return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv());
 321   }
 322
 323   bool isAmdCodeObjectV2(const MachineFunction &MF) const {
 324     return isAmdHsaOS() || isMesaKernel(MF);
 325   }
 326
 327   /// \brief Returns the offset in bytes from the start of the input buffer
 328   ///        of the first explicit kernel argument.
 329   unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
 330     return isAmdCodeObjectV2(MF) ? 0 : 36;
 331   }
 332
 333   unsigned getAlignmentForImplicitArgPtr() const {
 334     return isAmdHsaOS() ? 8 : 4;
 335   }
 336
 337   unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
 338     if (isMesaKernel(MF))
 339       return 16;
 340     if (isAmdHsaOS() && isOpenCLEnv())
 341       return 32;
 342     return 0;
 343   }
 344
 345   unsigned getStackAlignment() const {
 346     // Scratch is allocated in 256 dword per wave blocks.
 347     return 4 * 256 / getWavefrontSize();
 348   }
 349
 350   bool enableMachineScheduler() const override {
 351     return true;
 352   }
 353
 354   bool enableSubRegLiveness() const override {
 355     return true;
 356   }
 357
 358   /// \returns Number of execution units per compute unit supported by the
 359   /// subtarget.
 360   unsigned getEUsPerCU() const {
 361     return 4;
 362   }
 363
 364   /// \returns Maximum number of work groups per compute unit supported by the
 365   /// subtarget and limited by given flat work group size.
 366   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
 367     if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
 368       return 8;
 369     return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
 370   }
 371
 372   /// \returns Maximum number of waves per compute unit supported by the
 373   /// subtarget without any kind of limitation.
 374   unsigned getMaxWavesPerCU() const {
 375     return getMaxWavesPerEU() * getEUsPerCU();
 376   }
 377
 378   /// \returns Maximum number of waves per compute unit supported by the
 379   /// subtarget and limited by given flat work group size.
 380   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
 381     return getWavesPerWorkGroup(FlatWorkGroupSize);
 382   }
 383
 384   /// \returns Minimum number of waves per execution unit supported by the
 385   /// subtarget.
 386   unsigned getMinWavesPerEU() const {
 387     return 1;
 388   }
 389
 390   /// \returns Maximum number of waves per execution unit supported by the
 391   /// subtarget without any kind of limitation.
 392   unsigned getMaxWavesPerEU() const {
 393     if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
 394       return 8;
 395     // FIXME: Need to take scratch memory into account.
 396     return 10;
 397   }
 398
 399   /// \returns Maximum number of waves per execution unit supported by the
 400   /// subtarget and limited by given flat work group size.
 401   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
 402     return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
 403       getEUsPerCU();
 404   }
 405
 406   /// \returns Minimum flat work group size supported by the subtarget.
 407   unsigned getMinFlatWorkGroupSize() const {
 408     return 1;
 409   }
 410
 411   /// \returns Maximum flat work group size supported by the subtarget.
 412   unsigned getMaxFlatWorkGroupSize() const {
 413     return 2048;
 414   }
 415
 416   /// \returns Number of waves per work group given the flat work group size.
 417   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
 418     return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
 419   }
 420
 421   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
 422   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
 423
 424   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
 425   /// for function \p F, or minimum/maximum flat work group sizes explicitly
 426   /// requested using "amdgpu-flat-work-group-size" attribute attached to
 427   /// function \p F.
 428   ///
 429   /// \returns Subtarget's default values if explicitly requested values cannot
 430   /// be converted to integer, or violate subtarget's specifications.
 431   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
 432
 433   /// \returns Subtarget's default pair of minimum/maximum number of waves per
 434   /// execution unit for function \p F, or minimum/maximum number of waves per
 435   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
 436   /// attached to function \p F.
 437   ///
 438   /// \returns Subtarget's default values if explicitly requested values cannot
 439   /// be converted to integer, violate subtarget's specifications, or are not
 440   /// compatible with minimum/maximum number of waves limited by flat work group
 441   /// size, register usage, and/or lds usage.
 442   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
 443 };
 444
 445 class R600Subtarget final : public AMDGPUSubtarget {
 446 private:
 447   R600InstrInfo InstrInfo;
 448   R600FrameLowering FrameLowering;
 449   R600TargetLowering TLInfo;
 450
 451 public:
 452   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
 453                 const TargetMachine &TM);
 454
 455   const R600InstrInfo *getInstrInfo() const override {
 456     return &InstrInfo;
 457   }
 458
 459   const R600FrameLowering *getFrameLowering() const override {
 460     return &FrameLowering;
 461   }
 462
 463   const R600TargetLowering *getTargetLowering() const override {
 464     return &TLInfo;
 465   }
 466
 467   const R600RegisterInfo *getRegisterInfo() const override {
 468     return &InstrInfo.getRegisterInfo();
 469   }
 470
 471   bool hasCFAluBug() const {
 472     return CFALUBug;
 473   }
 474
 475   bool hasVertexCache() const {
 476     return HasVertexCache;
 477   }
 478
 479   short getTexVTXClauseSize() const {
 480     return TexVTXClauseSize;
 481   }
 482 };
 483
 484 class SISubtarget final : public AMDGPUSubtarget {
 485 public:
 486   enum {
 487     // The closed Vulkan driver sets 96, which limits the wave count to 8 but
 488     // doesn't spill SGPRs as much as when 80 is set.
 489     FIXED_SGPR_COUNT_FOR_INIT_BUG = 96
 490   };
 491
 492 private:
 493   SIInstrInfo InstrInfo;
 494   SIFrameLowering FrameLowering;
 495   SITargetLowering TLInfo;
 496   std::unique_ptr<GISelAccessor> GISel;
 497
 498 public:
 499   SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
 500               const TargetMachine &TM);
 501
 502   const SIInstrInfo *getInstrInfo() const override {
 503     return &InstrInfo;
 504   }
 505
 506   const SIFrameLowering *getFrameLowering() const override {
 507     return &FrameLowering;
 508   }
 509
 510   const SITargetLowering *getTargetLowering() const override {
 511     return &TLInfo;
 512   }
 513
 514   const CallLowering *getCallLowering() const override {
 515     assert(GISel && "Access to GlobalISel APIs not set");
 516     return GISel->getCallLowering();
 517   }
 518
 519   const SIRegisterInfo *getRegisterInfo() const override {
 520     return &InstrInfo.getRegisterInfo();
 521   }
 522
 523   void setGISelAccessor(GISelAccessor &GISel) {
 524     this->GISel.reset(&GISel);
 525   }
 526
 527   void overrideSchedPolicy(MachineSchedPolicy &Policy,
 528                            unsigned NumRegionInstrs) const override;
 529
 530   bool isVGPRSpillingEnabled(const Function& F) const;
 531
 532   unsigned getMaxNumUserSGPRs() const {
 533     return 16;
 534   }
 535
 536   bool hasFlatAddressSpace() const {
 537     return FlatAddressSpace;
 538   }
 539
 540   bool hasSMemRealTime() const {
 541     return HasSMemRealTime;
 542   }
 543
 544   bool hasMovrel() const {
 545     return HasMovrel;
 546   }
 547
 548   bool hasVGPRIndexMode() const {
 549     return HasVGPRIndexMode;
 550   }
 551
 552   bool hasScalarCompareEq64() const {
 553     return getGeneration() >= VOLCANIC_ISLANDS;
 554   }
 555
 556   bool hasScalarStores() const {
 557     return HasScalarStores;
 558   }
 559
 560   bool hasInv2PiInlineImm() const {
 561     return HasInv2PiInlineImm;
 562   }
 563
 564   bool enableSIScheduler() const {
 565     return EnableSIScheduler;
 566   }
 567
 568   bool debuggerSupported() const {
 569     return debuggerInsertNops() && debuggerReserveRegs() &&
 570       debuggerEmitPrologue();
 571   }
 572
 573   bool debuggerInsertNops() const {
 574     return DebuggerInsertNops;
 575   }
 576
 577   bool debuggerReserveRegs() const {
 578     return DebuggerReserveRegs;
 579   }
 580
 581   bool debuggerEmitPrologue() const {
 582     return DebuggerEmitPrologue;
 583   }
 584
 585   bool loadStoreOptEnabled() const {
 586     return EnableLoadStoreOpt;
 587   }
 588
 589   bool hasSGPRInitBug() const {
 590     return SGPRInitBug;
 591   }
 592
 593   bool has12DWordStoreHazard() const {
 594     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
 595   }
 596
 597   unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const;
 598
 599   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
 600   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
 601
 602   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
 603   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 604
 605   /// \returns True if waitcnt instruction is needed before barrier instruction,
 606   /// false otherwise.
 607   bool needWaitcntBeforeBarrier() const {
 608     return true;
 609   }
 610
 611   unsigned getMaxNumSGPRs() const;
 612 };
 613
 614 } // end namespace llvm
 615
 616 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H