contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h

   1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //==-----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// AMD GCN specific subclass of TargetSubtarget.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
  15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
  16
  17 #include "AMDGPUCallLowering.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "SIFrameLowering.h"
  20 #include "SIISelLowering.h"
  21 #include "SIInstrInfo.h"
  22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
  23
  24 #define GET_SUBTARGETINFO_HEADER
  25 #include "AMDGPUGenSubtargetInfo.inc"
  26
  27 namespace llvm {
  28
  29 class GCNTargetMachine;
  30
  31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
  32                            public AMDGPUSubtarget {
  33
  34   using AMDGPUSubtarget::getMaxWavesPerEU;
  35
  36 public:
  37   // Following 2 enums are documented at:
  38   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  /// Trap handler ABI selector; see getTrapHandlerAbi().
  enum class TrapHandlerAbi {
    NONE = 0x00,   ///< Returned when the OS is not AMDHSA.
    AMDHSA = 0x01, ///< Returned when the OS is AMDHSA.
  };
  43
  /// Numeric trap identifiers.
  enum class TrapID {
    LLVMAMDHSATrap = 0x02,
    LLVMAMDHSADebugTrap = 0x03,
  };
  48
  49 private:
  50   /// GlobalISel related APIs.
  51   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  52   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
  53   std::unique_ptr<InstructionSelector> InstSelector;
  54   std::unique_ptr<LegalizerInfo> Legalizer;
  55   std::unique_ptr<RegisterBankInfo> RegBankInfo;
  56
  57 protected:
  58   // Basic subtarget description.
  59   Triple TargetTriple;
  60   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
  61   unsigned Gen = INVALID;
  62   InstrItineraryData InstrItins;
  63   int LDSBankCount = 0;
  64   unsigned MaxPrivateElementSize = 0;
  65
  66   // Possibly statically set by tablegen, but may want to be overridden.
  67   bool FastFMAF32 = false;
  68   bool FastDenormalF32 = false;
  69   bool HalfRate64Ops = false;
  70   bool FullRate64Ops = false;
  71
  72   // Dynamically set bits that enable features.
  73   bool FlatForGlobal = false;
  74   bool AutoWaitcntBeforeBarrier = false;
  75   bool UnalignedScratchAccess = false;
  76   bool UnalignedAccessMode = false;
  77   bool HasApertureRegs = false;
  78   bool SupportsXNACK = false;
  79
  80   // This should not be used directly. 'TargetID' tracks the dynamic settings
  81   // for XNACK.
  82   bool EnableXNACK = false;
  83
  84   bool EnableTgSplit = false;
  85   bool EnableCuMode = false;
  86   bool TrapHandler = false;
  87
  88   // Used as options.
  89   bool EnableLoadStoreOpt = false;
  90   bool EnableUnsafeDSOffsetFolding = false;
  91   bool EnableSIScheduler = false;
  92   bool EnableDS128 = false;
  93   bool EnablePRTStrictNull = false;
  94   bool DumpCode = false;
  95
  // Static subtarget properties set by tablegen.
  97   bool FP64 = false;
  98   bool FMA = false;
  99   bool MIMG_R128 = false;
 100   bool CIInsts = false;
 101   bool GFX8Insts = false;
 102   bool GFX9Insts = false;
 103   bool GFX90AInsts = false;
 104   bool GFX940Insts = false;
 105   bool GFX10Insts = false;
 106   bool GFX11Insts = false;
 107   bool GFX10_3Insts = false;
 108   bool GFX7GFX8GFX9Insts = false;
 109   bool SGPRInitBug = false;
 110   bool UserSGPRInit16Bug = false;
 111   bool NegativeScratchOffsetBug = false;
 112   bool NegativeUnalignedScratchOffsetBug = false;
 113   bool HasSMemRealTime = false;
 114   bool HasIntClamp = false;
 115   bool HasFmaMixInsts = false;
 116   bool HasMovrel = false;
 117   bool HasVGPRIndexMode = false;
 118   bool HasScalarStores = false;
 119   bool HasScalarAtomics = false;
 120   bool HasSDWAOmod = false;
 121   bool HasSDWAScalar = false;
 122   bool HasSDWASdst = false;
 123   bool HasSDWAMac = false;
 124   bool HasSDWAOutModsVOPC = false;
 125   bool HasDPP = false;
 126   bool HasDPP8 = false;
 127   bool Has64BitDPP = false;
 128   bool HasPackedFP32Ops = false;
 129   bool HasImageInsts = false;
 130   bool HasExtendedImageInsts = false;
 131   bool HasR128A16 = false;
 132   bool HasGFX10A16 = false;
 133   bool HasG16 = false;
 134   bool HasNSAEncoding = false;
 135   unsigned NSAMaxSize = 0;
 136   bool GFX10_AEncoding = false;
 137   bool GFX10_BEncoding = false;
 138   bool HasDLInsts = false;
 139   bool HasDot1Insts = false;
 140   bool HasDot2Insts = false;
 141   bool HasDot3Insts = false;
 142   bool HasDot4Insts = false;
 143   bool HasDot5Insts = false;
 144   bool HasDot6Insts = false;
 145   bool HasDot7Insts = false;
 146   bool HasDot8Insts = false;
 147   bool HasMAIInsts = false;
 148   bool HasPkFmacF16Inst = false;
 149   bool HasAtomicFaddRtnInsts = false;
 150   bool HasAtomicFaddNoRtnInsts = false;
 151   bool HasAtomicPkFaddNoRtnInsts = false;
 152   bool SupportsSRAMECC = false;
 153
 154   // This should not be used directly. 'TargetID' tracks the dynamic settings
 155   // for SRAMECC.
 156   bool EnableSRAMECC = false;
 157
 158   bool HasNoSdstCMPX = false;
 159   bool HasVscnt = false;
 160   bool HasGetWaveIdInst = false;
 161   bool HasSMemTimeInst = false;
 162   bool HasShaderCyclesRegister = false;
 163   bool HasVOP3Literal = false;
 164   bool HasNoDataDepHazard = false;
 165   bool FlatAddressSpace = false;
 166   bool FlatInstOffsets = false;
 167   bool FlatGlobalInsts = false;
 168   bool FlatScratchInsts = false;
 169   bool ScalarFlatScratchInsts = false;
 170   bool HasArchitectedFlatScratch = false;
 171   bool EnableFlatScratch = false;
 172   bool AddNoCarryInsts = false;
 173   bool HasUnpackedD16VMem = false;
 174   bool LDSMisalignedBug = false;
 175   bool HasMFMAInlineLiteralBug = false;
 176   bool UnalignedBufferAccess = false;
 177   bool UnalignedDSAccess = false;
 178   bool HasPackedTID = false;
 179   bool ScalarizeGlobal = false;
 180
 181   bool HasVcmpxPermlaneHazard = false;
 182   bool HasVMEMtoScalarWriteHazard = false;
 183   bool HasSMEMtoVectorWriteHazard = false;
 184   bool HasInstFwdPrefetchBug = false;
 185   bool HasVcmpxExecWARHazard = false;
 186   bool HasLdsBranchVmemWARHazard = false;
 187   bool HasNSAtoVMEMBug = false;
 188   bool HasNSAClauseBug = false;
 189   bool HasOffset3fBug = false;
 190   bool HasFlatSegmentOffsetBug = false;
 191   bool HasImageStoreD16Bug = false;
 192   bool HasImageGather4D16Bug = false;
 193   bool HasVOPDInsts = false;
 194
 195   // Dummy feature to use for assembler in tablegen.
 196   bool FeatureDisable = false;
 197
 198   SelectionDAGTargetInfo TSInfo;
 199 private:
 200   SIInstrInfo InstrInfo;
 201   SITargetLowering TLInfo;
 202   SIFrameLowering FrameLowering;
 203
 204 public:
 205   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 206                const GCNTargetMachine &TM);
 207   ~GCNSubtarget() override;
 208
 209   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
 210                                                    StringRef GPU, StringRef FS);
 211
 212   const SIInstrInfo *getInstrInfo() const override {
 213     return &InstrInfo;
 214   }
 215
 216   const SIFrameLowering *getFrameLowering() const override {
 217     return &FrameLowering;
 218   }
 219
 220   const SITargetLowering *getTargetLowering() const override {
 221     return &TLInfo;
 222   }
 223
 224   const SIRegisterInfo *getRegisterInfo() const override {
 225     return &InstrInfo.getRegisterInfo();
 226   }
 227
 228   const CallLowering *getCallLowering() const override {
 229     return CallLoweringInfo.get();
 230   }
 231
 232   const InlineAsmLowering *getInlineAsmLowering() const override {
 233     return InlineAsmLoweringInfo.get();
 234   }
 235
 236   InstructionSelector *getInstructionSelector() const override {
 237     return InstSelector.get();
 238   }
 239
 240   const LegalizerInfo *getLegalizerInfo() const override {
 241     return Legalizer.get();
 242   }
 243
 244   const RegisterBankInfo *getRegBankInfo() const override {
 245     return RegBankInfo.get();
 246   }
 247
 248   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
 249     return TargetID;
 250   }
 251
 252   // Nothing implemented, just prevent crashes on use.
 253   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
 254     return &TSInfo;
 255   }
 256
 257   const InstrItineraryData *getInstrItineraryData() const override {
 258     return &InstrItins;
 259   }
 260
 261   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
 262
 263   Generation getGeneration() const {
 264     return (Generation)Gen;
 265   }
 266
 267   unsigned getMaxWaveScratchSize() const {
 268     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
 269     if (getGeneration() < GFX11) {
 270       // 13-bit field in units of 256-dword.
 271       return (256 * 4) * ((1 << 13) - 1);
 272     }
 273     // 15-bit field in units of 64-dword.
 274     return (64 * 4) * ((1 << 15) - 1);
 275   }
 276
 277   /// Return the number of high bits known to be zero for a frame index.
 278   unsigned getKnownHighZeroBitsForFrameIndex() const {
 279     return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
 280   }
 281
 282   int getLDSBankCount() const {
 283     return LDSBankCount;
 284   }
 285
 286   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
 287     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
 288   }
 289
 290   unsigned getConstantBusLimit(unsigned Opcode) const;
 291
 292   /// Returns if the result of this instruction with a 16-bit result returned in
 293   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
 294   /// the original value.
 295   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
 296
 297   bool hasIntClamp() const {
 298     return HasIntClamp;
 299   }
 300
 301   bool hasFP64() const {
 302     return FP64;
 303   }
 304
 305   bool hasMIMG_R128() const {
 306     return MIMG_R128;
 307   }
 308
 309   bool hasHWFP64() const {
 310     return FP64;
 311   }
 312
 313   bool hasFastFMAF32() const {
 314     return FastFMAF32;
 315   }
 316
 317   bool hasHalfRate64Ops() const {
 318     return HalfRate64Ops;
 319   }
 320
 321   bool hasFullRate64Ops() const {
 322     return FullRate64Ops;
 323   }
 324
 325   bool hasAddr64() const {
 326     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
 327   }
 328
 329   bool hasFlat() const {
 330     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
 331   }
 332
 333   // Return true if the target only has the reverse operand versions of VALU
 334   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
 335   bool hasOnlyRevVALUShifts() const {
 336     return getGeneration() >= VOLCANIC_ISLANDS;
 337   }
 338
 339   bool hasFractBug() const {
 340     return getGeneration() == SOUTHERN_ISLANDS;
 341   }
 342
 343   bool hasBFE() const {
 344     return true;
 345   }
 346
 347   bool hasBFI() const {
 348     return true;
 349   }
 350
 351   bool hasBFM() const {
 352     return hasBFE();
 353   }
 354
 355   bool hasBCNT(unsigned Size) const {
 356     return true;
 357   }
 358
 359   bool hasFFBL() const {
 360     return true;
 361   }
 362
 363   bool hasFFBH() const {
 364     return true;
 365   }
 366
 367   bool hasMed3_16() const {
 368     return getGeneration() >= AMDGPUSubtarget::GFX9;
 369   }
 370
 371   bool hasMin3Max3_16() const {
 372     return getGeneration() >= AMDGPUSubtarget::GFX9;
 373   }
 374
 375   bool hasFmaMixInsts() const {
 376     return HasFmaMixInsts;
 377   }
 378
 379   bool hasCARRY() const {
 380     return true;
 381   }
 382
 383   bool hasFMA() const {
 384     return FMA;
 385   }
 386
 387   bool hasSwap() const {
 388     return GFX9Insts;
 389   }
 390
 391   bool hasScalarPackInsts() const {
 392     return GFX9Insts;
 393   }
 394
 395   bool hasScalarMulHiInsts() const {
 396     return GFX9Insts;
 397   }
 398
 399   TrapHandlerAbi getTrapHandlerAbi() const {
 400     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
 401   }
 402
 403   bool supportsGetDoorbellID() const {
 404     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
 405     return getGeneration() >= GFX9;
 406   }
 407
 408   /// True if the offset field of DS instructions works as expected. On SI, the
 409   /// offset uses a 16-bit adder and does not always wrap properly.
 410   bool hasUsableDSOffset() const {
 411     return getGeneration() >= SEA_ISLANDS;
 412   }
 413
 414   bool unsafeDSOffsetFoldingEnabled() const {
 415     return EnableUnsafeDSOffsetFolding;
 416   }
 417
 418   /// Condition output from div_scale is usable.
 419   bool hasUsableDivScaleConditionOutput() const {
 420     return getGeneration() != SOUTHERN_ISLANDS;
 421   }
 422
 423   /// Extra wait hazard is needed in some cases before
 424   /// s_cbranch_vccnz/s_cbranch_vccz.
 425   bool hasReadVCCZBug() const {
 426     return getGeneration() <= SEA_ISLANDS;
 427   }
 428
 429   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
 430   bool partialVCCWritesUpdateVCCZ() const {
 431     return getGeneration() >= GFX10;
 432   }
 433
 434   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
 435   /// was written by a VALU instruction.
 436   bool hasSMRDReadVALUDefHazard() const {
 437     return getGeneration() == SOUTHERN_ISLANDS;
 438   }
 439
 440   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
 441   /// SGPR was written by a VALU Instruction.
 442   bool hasVMEMReadSGPRVALUDefHazard() const {
 443     return getGeneration() >= VOLCANIC_ISLANDS;
 444   }
 445
 446   bool hasRFEHazards() const {
 447     return getGeneration() >= VOLCANIC_ISLANDS;
 448   }
 449
 450   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
 451   unsigned getSetRegWaitStates() const {
 452     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
 453   }
 454
 455   bool dumpCode() const {
 456     return DumpCode;
 457   }
 458
 459   /// Return the amount of LDS that can be used that will not restrict the
 460   /// occupancy lower than WaveCount.
 461   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
 462                                            const Function &) const;
 463
 464   bool supportsMinMaxDenormModes() const {
 465     return getGeneration() >= AMDGPUSubtarget::GFX9;
 466   }
 467
 468   /// \returns If target supports S_DENORM_MODE.
 469   bool hasDenormModeInst() const {
 470     return getGeneration() >= AMDGPUSubtarget::GFX10;
 471   }
 472
 473   bool useFlatForGlobal() const {
 474     return FlatForGlobal;
 475   }
 476
 477   /// \returns If target supports ds_read/write_b128 and user enables generation
 478   /// of ds_read/write_b128.
 479   bool useDS128() const {
 480     return CIInsts && EnableDS128;
 481   }
 482
 483   /// \return If target supports ds_read/write_b96/128.
 484   bool hasDS96AndDS128() const {
 485     return CIInsts;
 486   }
 487
 488   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
 489   bool haveRoundOpsF64() const {
 490     return CIInsts;
 491   }
 492
 493   /// \returns If MUBUF instructions always perform range checking, even for
 494   /// buffer resources used for private memory access.
 495   bool privateMemoryResourceIsRangeChecked() const {
 496     return getGeneration() < AMDGPUSubtarget::GFX9;
 497   }
 498
 499   /// \returns If target requires PRT Struct NULL support (zero result registers
 500   /// for sparse texture support).
 501   bool usePRTStrictNull() const {
 502     return EnablePRTStrictNull;
 503   }
 504
 505   bool hasAutoWaitcntBeforeBarrier() const {
 506     return AutoWaitcntBeforeBarrier;
 507   }
 508
 509   bool hasUnalignedBufferAccess() const {
 510     return UnalignedBufferAccess;
 511   }
 512
 513   bool hasUnalignedBufferAccessEnabled() const {
 514     return UnalignedBufferAccess && UnalignedAccessMode;
 515   }
 516
 517   bool hasUnalignedDSAccess() const {
 518     return UnalignedDSAccess;
 519   }
 520
 521   bool hasUnalignedDSAccessEnabled() const {
 522     return UnalignedDSAccess && UnalignedAccessMode;
 523   }
 524
 525   bool hasUnalignedScratchAccess() const {
 526     return UnalignedScratchAccess;
 527   }
 528
 529   bool hasUnalignedAccessMode() const {
 530     return UnalignedAccessMode;
 531   }
 532
 533   bool hasApertureRegs() const {
 534     return HasApertureRegs;
 535   }
 536
 537   bool isTrapHandlerEnabled() const {
 538     return TrapHandler;
 539   }
 540
 541   bool isXNACKEnabled() const {
 542     return TargetID.isXnackOnOrAny();
 543   }
 544
 545   bool isTgSplitEnabled() const {
 546     return EnableTgSplit;
 547   }
 548
 549   bool isCuModeEnabled() const {
 550     return EnableCuMode;
 551   }
 552
 553   bool hasFlatAddressSpace() const {
 554     return FlatAddressSpace;
 555   }
 556
 557   bool hasFlatScrRegister() const {
 558     return hasFlatAddressSpace();
 559   }
 560
 561   bool hasFlatInstOffsets() const {
 562     return FlatInstOffsets;
 563   }
 564
 565   bool hasFlatGlobalInsts() const {
 566     return FlatGlobalInsts;
 567   }
 568
 569   bool hasFlatScratchInsts() const {
 570     return FlatScratchInsts;
 571   }
 572
 573   // Check if target supports ST addressing mode with FLAT scratch instructions.
 574   // The ST addressing mode means no registers are used, either VGPR or SGPR,
 575   // but only immediate offset is swizzled and added to the FLAT scratch base.
 576   bool hasFlatScratchSTMode() const {
 577     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
 578   }
 579
 580   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
 581
 582   bool hasScalarFlatScratchInsts() const {
 583     return ScalarFlatScratchInsts;
 584   }
 585
 586   bool enableFlatScratch() const {
 587     return flatScratchIsArchitected() ||
 588            (EnableFlatScratch && hasFlatScratchInsts());
 589   }
 590
 591   bool hasGlobalAddTidInsts() const {
 592     return GFX10_BEncoding;
 593   }
 594
 595   bool hasAtomicCSub() const {
 596     return GFX10_BEncoding;
 597   }
 598
 599   bool hasMultiDwordFlatScratchAddressing() const {
 600     return getGeneration() >= GFX9;
 601   }
 602
 603   bool hasFlatSegmentOffsetBug() const {
 604     return HasFlatSegmentOffsetBug;
 605   }
 606
 607   bool hasFlatLgkmVMemCountInOrder() const {
 608     return getGeneration() > GFX9;
 609   }
 610
 611   bool hasD16LoadStore() const {
 612     return getGeneration() >= GFX9;
 613   }
 614
 615   bool d16PreservesUnusedBits() const {
 616     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
 617   }
 618
 619   bool hasD16Images() const {
 620     return getGeneration() >= VOLCANIC_ISLANDS;
 621   }
 622
 623   /// Return if most LDS instructions have an m0 use that require m0 to be
 624   /// initialized.
 625   bool ldsRequiresM0Init() const {
 626     return getGeneration() < GFX9;
 627   }
 628
 629   // True if the hardware rewinds and replays GWS operations if a wave is
 630   // preempted.
 631   //
 632   // If this is false, a GWS operation requires testing if a nack set the
 633   // MEM_VIOL bit, and repeating if so.
 634   bool hasGWSAutoReplay() const {
 635     return getGeneration() >= GFX9;
 636   }
 637
 638   /// \returns if target has ds_gws_sema_release_all instruction.
 639   bool hasGWSSemaReleaseAll() const {
 640     return CIInsts;
 641   }
 642
 643   /// \returns true if the target has integer add/sub instructions that do not
 644   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
 645   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
 646   /// for saturation.
 647   bool hasAddNoCarry() const {
 648     return AddNoCarryInsts;
 649   }
 650
 651   bool hasUnpackedD16VMem() const {
 652     return HasUnpackedD16VMem;
 653   }
 654
 655   // Covers VS/PS/CS graphics shaders
 656   bool isMesaGfxShader(const Function &F) const {
 657     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
 658   }
 659
 660   bool hasMad64_32() const {
 661     return getGeneration() >= SEA_ISLANDS;
 662   }
 663
 664   bool hasSDWAOmod() const {
 665     return HasSDWAOmod;
 666   }
 667
 668   bool hasSDWAScalar() const {
 669     return HasSDWAScalar;
 670   }
 671
 672   bool hasSDWASdst() const {
 673     return HasSDWASdst;
 674   }
 675
 676   bool hasSDWAMac() const {
 677     return HasSDWAMac;
 678   }
 679
 680   bool hasSDWAOutModsVOPC() const {
 681     return HasSDWAOutModsVOPC;
 682   }
 683
 684   bool hasDLInsts() const {
 685     return HasDLInsts;
 686   }
 687
 688   bool hasDot1Insts() const {
 689     return HasDot1Insts;
 690   }
 691
 692   bool hasDot2Insts() const {
 693     return HasDot2Insts;
 694   }
 695
 696   bool hasDot3Insts() const {
 697     return HasDot3Insts;
 698   }
 699
 700   bool hasDot4Insts() const {
 701     return HasDot4Insts;
 702   }
 703
 704   bool hasDot5Insts() const {
 705     return HasDot5Insts;
 706   }
 707
 708   bool hasDot6Insts() const {
 709     return HasDot6Insts;
 710   }
 711
 712   bool hasDot7Insts() const {
 713     return HasDot7Insts;
 714   }
 715
 716   bool hasDot8Insts() const {
 717     return HasDot8Insts;
 718   }
 719
 720   bool hasMAIInsts() const {
 721     return HasMAIInsts;
 722   }
 723
 724   bool hasPkFmacF16Inst() const {
 725     return HasPkFmacF16Inst;
 726   }
 727
 728   bool hasAtomicFaddInsts() const {
 729     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
 730   }
 731
 732   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
 733
 734   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
 735
 736   bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; }
 737
 738   bool hasNoSdstCMPX() const {
 739     return HasNoSdstCMPX;
 740   }
 741
 742   bool hasVscnt() const {
 743     return HasVscnt;
 744   }
 745
 746   bool hasGetWaveIdInst() const {
 747     return HasGetWaveIdInst;
 748   }
 749
 750   bool hasSMemTimeInst() const {
 751     return HasSMemTimeInst;
 752   }
 753
 754   bool hasShaderCyclesRegister() const {
 755     return HasShaderCyclesRegister;
 756   }
 757
 758   bool hasVOP3Literal() const {
 759     return HasVOP3Literal;
 760   }
 761
 762   bool hasNoDataDepHazard() const {
 763     return HasNoDataDepHazard;
 764   }
 765
 766   bool vmemWriteNeedsExpWaitcnt() const {
 767     return getGeneration() < SEA_ISLANDS;
 768   }
 769
 770   // Scratch is allocated in 256 dword per wave blocks for the entire
 771   // wavefront. When viewed from the perspective of an arbitrary workitem, this
 772   // is 4-byte aligned.
 773   //
 774   // Only 4-byte alignment is really needed to access anything. Transformations
 775   // on the pointer value itself may rely on the alignment / known low bits of
 776   // the pointer. Set this to something above the minimum to avoid needing
 777   // dynamic realignment in common cases.
 778   Align getStackAlignment() const { return Align(16); }
 779
 780   bool enableMachineScheduler() const override {
 781     return true;
 782   }
 783
 784   bool useAA() const override;
 785
 786   bool enableSubRegLiveness() const override {
 787     return true;
 788   }
 789
 790   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
 791   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
 792
 793   // static wrappers
 794   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
 795
 796   // XXX - Why is this here if it isn't in the default pass set?
 797   bool enableEarlyIfConversion() const override {
 798     return true;
 799   }
 800
 801   void overrideSchedPolicy(MachineSchedPolicy &Policy,
 802                            unsigned NumRegionInstrs) const override;
 803
 804   unsigned getMaxNumUserSGPRs() const {
 805     return 16;
 806   }
 807
 808   bool hasSMemRealTime() const {
 809     return HasSMemRealTime;
 810   }
 811
 812   bool hasMovrel() const {
 813     return HasMovrel;
 814   }
 815
 816   bool hasVGPRIndexMode() const {
 817     return HasVGPRIndexMode;
 818   }
 819
 820   bool useVGPRIndexMode() const;
 821
 822   bool hasScalarCompareEq64() const {
 823     return getGeneration() >= VOLCANIC_ISLANDS;
 824   }
 825
 826   bool hasScalarStores() const {
 827     return HasScalarStores;
 828   }
 829
 830   bool hasScalarAtomics() const {
 831     return HasScalarAtomics;
 832   }
 833
 834   bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
 835
 836   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
 837   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
 838
 839   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
 840   bool hasPermLane64() const { return getGeneration() >= GFX11; }
 841
 842   bool hasDPP() const {
 843     return HasDPP;
 844   }
 845
 846   bool hasDPPBroadcasts() const {
 847     return HasDPP && getGeneration() < GFX10;
 848   }
 849
 850   bool hasDPPWavefrontShifts() const {
 851     return HasDPP && getGeneration() < GFX10;
 852   }
 853
 854   bool hasDPP8() const {
 855     return HasDPP8;
 856   }
 857
 858   bool has64BitDPP() const {
 859     return Has64BitDPP;
 860   }
 861
 862   bool hasPackedFP32Ops() const {
 863     return HasPackedFP32Ops;
 864   }
 865
 866   bool hasFmaakFmamkF32Insts() const {
 867     return getGeneration() >= GFX10 || hasGFX940Insts();
 868   }
 869
 870   bool hasImageInsts() const {
 871     return HasImageInsts;
 872   }
 873
 874   bool hasExtendedImageInsts() const {
 875     return HasExtendedImageInsts;
 876   }
 877
 878   bool hasR128A16() const {
 879     return HasR128A16;
 880   }
 881
 882   bool hasGFX10A16() const {
 883     return HasGFX10A16;
 884   }
 885
 886   bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
 887
 888   bool hasG16() const { return HasG16; }
 889
 890   bool hasOffset3fBug() const {
 891     return HasOffset3fBug;
 892   }
 893
 894   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
 895
 896   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
 897
 898   bool hasNSAEncoding() const { return HasNSAEncoding; }
 899
 900   unsigned getNSAMaxSize() const { return NSAMaxSize; }
 901
 902   bool hasGFX10_AEncoding() const {
 903     return GFX10_AEncoding;
 904   }
 905
 906   bool hasGFX10_BEncoding() const {
 907     return GFX10_BEncoding;
 908   }
 909
 910   bool hasGFX10_3Insts() const {
 911     return GFX10_3Insts;
 912   }
 913
 914   bool hasMadF16() const;
 915
 916   bool hasMovB64() const { return GFX940Insts; }
 917
 918   bool hasLshlAddB64() const { return GFX940Insts; }
 919
 920   bool enableSIScheduler() const {
 921     return EnableSIScheduler;
 922   }
 923
 924   bool loadStoreOptEnabled() const {
 925     return EnableLoadStoreOpt;
 926   }
 927
 928   bool hasSGPRInitBug() const {
 929     return SGPRInitBug;
 930   }
 931
 932   bool hasUserSGPRInit16Bug() const {
 933     return UserSGPRInit16Bug;
 934   }
 935
 936   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
 937
 938   bool hasNegativeUnalignedScratchOffsetBug() const {
 939     return NegativeUnalignedScratchOffsetBug;
 940   }
 941
 942   bool hasMFMAInlineLiteralBug() const {
 943     return HasMFMAInlineLiteralBug;
 944   }
 945
 946   bool has12DWordStoreHazard() const {
 947     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
 948   }
 949
 950   // \returns true if the subtarget supports DWORDX3 load/store instructions.
 951   bool hasDwordx3LoadStores() const {
 952     return CIInsts;
 953   }
 954
 955   bool hasReadM0MovRelInterpHazard() const {
 956     return getGeneration() == AMDGPUSubtarget::GFX9;
 957   }
 958
 959   bool hasReadM0SendMsgHazard() const {
 960     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
 961            getGeneration() <= AMDGPUSubtarget::GFX9;
 962   }
 963
 964   bool hasReadM0LdsDmaHazard() const {
 965     return getGeneration() == AMDGPUSubtarget::GFX9;
 966   }
 967
 968   bool hasReadM0LdsDirectHazard() const {
 969     return getGeneration() == AMDGPUSubtarget::GFX9;
 970   }
 971
 972   bool hasVcmpxPermlaneHazard() const {
 973     return HasVcmpxPermlaneHazard;
 974   }
 975
 976   bool hasVMEMtoScalarWriteHazard() const {
 977     return HasVMEMtoScalarWriteHazard;
 978   }
 979
 980   bool hasSMEMtoVectorWriteHazard() const {
 981     return HasSMEMtoVectorWriteHazard;
 982   }
 983
 984   bool hasLDSMisalignedBug() const {
 985     return LDSMisalignedBug && !EnableCuMode;
 986   }
 987
 988   bool hasInstFwdPrefetchBug() const {
 989     return HasInstFwdPrefetchBug;
 990   }
 991
 992   bool hasVcmpxExecWARHazard() const {
 993     return HasVcmpxExecWARHazard;
 994   }
 995
 996   bool hasLdsBranchVmemWARHazard() const {
 997     return HasLdsBranchVmemWARHazard;
 998   }
 999
1000   // Has one cycle hazard on transcendental instruction feeding a
1001   // non transcendental VALU.
1002   bool hasTransForwardingHazard() const { return GFX940Insts; }
1003
1004   // Has one cycle hazard on a VALU instruction partially writing dst with
1005   // a shift of result bits feeding another VALU instruction.
1006   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1007
1008   // Cannot use op_sel with v_dot instructions.
1009   bool hasDOTOpSelHazard() const { return GFX940Insts; }
1010
1011   // Does not have HW interlocs for VALU writing and then reading SGPRs.
1012   bool hasVDecCoExecHazard() const {
1013     return GFX940Insts;
1014   }
1015
1016   bool hasNSAtoVMEMBug() const {
1017     return HasNSAtoVMEMBug;
1018   }
1019
1020   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1021
1022   bool hasHardClauses() const { return getGeneration() >= GFX10; }
1023
1024   bool hasGFX90AInsts() const { return GFX90AInsts; }
1025
1026   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1027
1028   bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1029
1030   bool hasVALUPartialForwardingHazard() const {
1031     return getGeneration() >= GFX11;
1032   }
1033
1034   bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; }
1035
1036   /// Return if operations acting on VGPR tuples require even alignment.
1037   bool needsAlignedVGPRs() const { return GFX90AInsts; }
1038
1039   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1040   bool hasSPackHL() const { return GFX11Insts; }
1041
1042   /// Return true if the target's EXP instruction has the COMPR flag, which
1043   /// affects the meaning of the EN (enable) bits.
1044   bool hasCompressedExport() const { return !GFX11Insts; }
1045
1046   /// Return true if the target's EXP instruction supports the NULL export
1047   /// target.
1048   bool hasNullExportTarget() const { return !GFX11Insts; }
1049
1050   bool hasVOPDInsts() const { return HasVOPDInsts; }
1051
1052   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1053
1054   /// Return true if the target has the S_DELAY_ALU instruction.
1055   bool hasDelayAlu() const { return GFX11Insts; }
1056
1057   bool hasPackedTID() const { return HasPackedTID; }
1058
1059   // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1060   // hasGFX90AInsts is also true.
1061   bool hasGFX940Insts() const { return GFX940Insts; }
1062
  /// \returns the maximum number of waves per SIMD for kernels that use
  /// \p SGPRs SGPRs.
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1066
  /// \returns the maximum number of waves per SIMD for kernels that use
  /// \p VGPRs VGPRs.
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1070
  /// \returns the occupancy for the given function \p F. The LDS usage
  /// \p LDSSize and the register counts \p NumSGPRs and \p NumVGPRs are used
  /// if provided; each of them defaults to 0.
  /// Note, occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
                            unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1077
1078   /// \returns true if the flat_scratch register should be initialized with the
1079   /// pointer to the wave's scratch memory rather than a size and offset.
1080   bool flatScratchIsPointer() const {
1081     return getGeneration() >= AMDGPUSubtarget::GFX9;
1082   }
1083
1084   /// \returns true if the flat_scratch register is initialized by the HW.
1085   /// In this case it is readonly.
1086   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1087
1088   /// \returns true if the machine has merged shaders in which s0-s7 are
1089   /// reserved by the hardware and user SGPRs start at s8
1090   bool hasMergedShaders() const {
1091     return getGeneration() >= GFX9;
1092   }
1093
1094   // \returns true if the target supports the pre-NGG legacy geometry path.
1095   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1096
1097   /// \returns SGPR allocation granularity supported by the subtarget.
1098   unsigned getSGPRAllocGranule() const {
1099     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1100   }
1101
1102   /// \returns SGPR encoding granularity supported by the subtarget.
1103   unsigned getSGPREncodingGranule() const {
1104     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1105   }
1106
1107   /// \returns Total number of SGPRs supported by the subtarget.
1108   unsigned getTotalNumSGPRs() const {
1109     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1110   }
1111
1112   /// \returns Addressable number of SGPRs supported by the subtarget.
1113   unsigned getAddressableNumSGPRs() const {
1114     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1115   }
1116
1117   /// \returns Minimum number of SGPRs that meets the given number of waves per
1118   /// execution unit requirement supported by the subtarget.
1119   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1120     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1121   }
1122
1123   /// \returns Maximum number of SGPRs that meets the given number of waves per
1124   /// execution unit requirement supported by the subtarget.
1125   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1126     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1127   }
1128
  /// \returns the reserved number of SGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs.
  /// \param HasFlatScratch flag supplied by the caller; presumably whether
  /// flat scratch is in use (confirm against the definition).
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
  /// \returns the reserved number of SGPRs for the given machine function
  /// \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1135
  /// \returns the reserved number of SGPRs for the given IR function \p F.
  unsigned getReservedNumSGPRs(const Function &F) const;
1138
  /// \returns the maximum number of SGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getMaxNumSGPRs.
  /// NOTE(review): how \p WavesPerEU, \p PreloadedSGPRs and
  /// \p ReservedNumSGPRs enter the computation is not visible from this
  /// declaration; see the out-of-line definition.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;
1146
  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p MF.
  ///
  /// The value that meets the number of waves per execution unit requirement
  /// is returned if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1156
  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p F.
  ///
  /// The value that meets the number of waves per execution unit requirement
  /// is returned if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;
1166
1167   /// \returns VGPR allocation granularity supported by the subtarget.
1168   unsigned getVGPRAllocGranule() const {
1169     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1170   }
1171
1172   /// \returns VGPR encoding granularity supported by the subtarget.
1173   unsigned getVGPREncodingGranule() const {
1174     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1175   }
1176
1177   /// \returns Total number of VGPRs supported by the subtarget.
1178   unsigned getTotalNumVGPRs() const {
1179     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1180   }
1181
1182   /// \returns Addressable number of VGPRs supported by the subtarget.
1183   unsigned getAddressableNumVGPRs() const {
1184     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1185   }
1186
1187   /// \returns Minimum number of VGPRs that meets given number of waves per
1188   /// execution unit requirement supported by the subtarget.
1189   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1190     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1191   }
1192
1193   /// \returns Maximum number of VGPRs that meets given number of waves per
1194   /// execution unit requirement supported by the subtarget.
1195   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1196     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1197   }
1198
  /// \returns the maximum number of VGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getMaxNumVGPRs.
  /// NOTE(review): how \p WavesPerEU enters the computation is not visible
  /// from this declaration; see the out-of-line definition.
  unsigned getBaseMaxNumVGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU) const;
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p F.
  ///
  /// The value that meets the number of waves per execution unit requirement
  /// is returned if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;
1212
1213   unsigned getMaxNumAGPRs(const Function &F) const {
1214     return getMaxNumVGPRs(F);
1215   }
1216
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p MF.
  ///
  /// The value that meets the number of waves per execution unit requirement
  /// is returned if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1226
  /// Override of the base-class hook taking the list \p Mutations by
  /// non-const reference. NOTE(review): presumably appends the post-RA
  /// scheduling DAG mutations for this subtarget to \p Mutations; confirm
  /// against the out-of-line definition.
  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;
1230
  /// \returns a newly created ScheduleDAGMutation whose ownership passes to
  /// the caller. NOTE(review): judging by the name this is the MFMA shadow
  /// filling mutation built from \p TII; confirm against the definition.
  std::unique_ptr<ScheduleDAGMutation>
  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1233
1234   bool isWave32() const {
1235     return getWavefrontSize() == 32;
1236   }
1237
1238   bool isWave64() const {
1239     return getWavefrontSize() == 64;
1240   }
1241
1242   const TargetRegisterClass *getBoolRC() const {
1243     return getRegisterInfo()->getBoolRC();
1244   }
1245
1246   /// \returns Maximum number of work groups per compute unit supported by the
1247   /// subtarget and limited by given \p FlatWorkGroupSize.
1248   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1249     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1250   }
1251
1252   /// \returns Minimum flat work group size supported by the subtarget.
1253   unsigned getMinFlatWorkGroupSize() const override {
1254     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1255   }
1256
1257   /// \returns Maximum flat work group size supported by the subtarget.
1258   unsigned getMaxFlatWorkGroupSize() const override {
1259     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1260   }
1261
1262   /// \returns Number of waves per execution unit required to support the given
1263   /// \p FlatWorkGroupSize.
1264   unsigned
1265   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1266     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1267   }
1268
1269   /// \returns Minimum number of waves per execution unit supported by the
1270   /// subtarget.
1271   unsigned getMinWavesPerEU() const override {
1272     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1273   }
1274
  /// Override of the base-class scheduling hook. \p Dep is taken by non-const
  /// reference. NOTE(review): presumably \p Dep, the dependency between
  /// operand \p DefOpIdx of \p Def and operand \p UseOpIdx of \p Use, is
  /// updated in place; confirm against the out-of-line definition.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep) const override;
1277
1278   // \returns true if it's beneficial on this subtarget for the scheduler to
1279   // cluster stores as well as loads.
1280   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1281 };
1282
1283 } // end namespace llvm
1284
1285 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H