1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //==-----------------------------------------------------------------------===//
10 /// AMD GCN specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIFrameLowering.h"
20 #include "SIISelLowering.h"
21 #include "SIInstrInfo.h"
22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
24 #define GET_SUBTARGETINFO_HEADER
25 #include "AMDGPUGenSubtargetInfo.inc"
29 class GCNTargetMachine;
31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
32 public AMDGPUSubtarget {
34 using AMDGPUSubtarget::getMaxWavesPerEU;
37 // Following 2 enums are documented at:
38 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
39 enum class TrapHandlerAbi {
45 LLVMAMDHSATrap = 0x02,
46 LLVMAMDHSADebugTrap = 0x03,
50 /// GlobalISel related APIs.
51 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
52 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
53 std::unique_ptr<InstructionSelector> InstSelector;
54 std::unique_ptr<LegalizerInfo> Legalizer;
55 std::unique_ptr<RegisterBankInfo> RegBankInfo;
58 // Basic subtarget description.
60 AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
61 unsigned Gen = INVALID;
62 InstrItineraryData InstrItins;
64 unsigned MaxPrivateElementSize = 0;
66 // Possibly statically set by tablegen, but may want to be overridden.
67 bool FastFMAF32 = false;
68 bool FastDenormalF32 = false;
69 bool HalfRate64Ops = false;
70 bool FullRate64Ops = false;
72 // Dynamically set bits that enable features.
73 bool FlatForGlobal = false;
74 bool AutoWaitcntBeforeBarrier = false;
75 bool UnalignedScratchAccess = false;
76 bool UnalignedAccessMode = false;
77 bool HasApertureRegs = false;
78 bool SupportsXNACK = false;
80 // This should not be used directly. 'TargetID' tracks the dynamic settings
82 bool EnableXNACK = false;
84 bool EnableTgSplit = false;
85 bool EnableCuMode = false;
86 bool TrapHandler = false;
89 bool EnableLoadStoreOpt = false;
90 bool EnableUnsafeDSOffsetFolding = false;
91 bool EnableSIScheduler = false;
92 bool EnableDS128 = false;
93 bool EnablePRTStrictNull = false;
94 bool DumpCode = false;
96 // Subtarget properties statically set by tablegen
99 bool MIMG_R128 = false;
100 bool CIInsts = false;
101 bool GFX8Insts = false;
102 bool GFX9Insts = false;
103 bool GFX90AInsts = false;
104 bool GFX940Insts = false;
105 bool GFX10Insts = false;
106 bool GFX11Insts = false;
107 bool GFX10_3Insts = false;
108 bool GFX7GFX8GFX9Insts = false;
109 bool SGPRInitBug = false;
110 bool UserSGPRInit16Bug = false;
111 bool NegativeScratchOffsetBug = false;
112 bool NegativeUnalignedScratchOffsetBug = false;
113 bool HasSMemRealTime = false;
114 bool HasIntClamp = false;
115 bool HasFmaMixInsts = false;
116 bool HasMovrel = false;
117 bool HasVGPRIndexMode = false;
118 bool HasScalarStores = false;
119 bool HasScalarAtomics = false;
120 bool HasSDWAOmod = false;
121 bool HasSDWAScalar = false;
122 bool HasSDWASdst = false;
123 bool HasSDWAMac = false;
124 bool HasSDWAOutModsVOPC = false;
126 bool HasDPP8 = false;
127 bool Has64BitDPP = false;
128 bool HasPackedFP32Ops = false;
129 bool HasImageInsts = false;
130 bool HasExtendedImageInsts = false;
131 bool HasR128A16 = false;
132 bool HasGFX10A16 = false;
134 bool HasNSAEncoding = false;
135 unsigned NSAMaxSize = 0;
136 bool GFX10_AEncoding = false;
137 bool GFX10_BEncoding = false;
138 bool HasDLInsts = false;
139 bool HasDot1Insts = false;
140 bool HasDot2Insts = false;
141 bool HasDot3Insts = false;
142 bool HasDot4Insts = false;
143 bool HasDot5Insts = false;
144 bool HasDot6Insts = false;
145 bool HasDot7Insts = false;
146 bool HasDot8Insts = false;
147 bool HasMAIInsts = false;
148 bool HasPkFmacF16Inst = false;
149 bool HasAtomicFaddRtnInsts = false;
150 bool HasAtomicFaddNoRtnInsts = false;
151 bool HasAtomicPkFaddNoRtnInsts = false;
152 bool SupportsSRAMECC = false;
154 // This should not be used directly. 'TargetID' tracks the dynamic settings
156 bool EnableSRAMECC = false;
158 bool HasNoSdstCMPX = false;
159 bool HasVscnt = false;
160 bool HasGetWaveIdInst = false;
161 bool HasSMemTimeInst = false;
162 bool HasShaderCyclesRegister = false;
163 bool HasVOP3Literal = false;
164 bool HasNoDataDepHazard = false;
165 bool FlatAddressSpace = false;
166 bool FlatInstOffsets = false;
167 bool FlatGlobalInsts = false;
168 bool FlatScratchInsts = false;
169 bool ScalarFlatScratchInsts = false;
170 bool HasArchitectedFlatScratch = false;
171 bool EnableFlatScratch = false;
172 bool AddNoCarryInsts = false;
173 bool HasUnpackedD16VMem = false;
174 bool LDSMisalignedBug = false;
175 bool HasMFMAInlineLiteralBug = false;
176 bool UnalignedBufferAccess = false;
177 bool UnalignedDSAccess = false;
178 bool HasPackedTID = false;
179 bool ScalarizeGlobal = false;
181 bool HasVcmpxPermlaneHazard = false;
182 bool HasVMEMtoScalarWriteHazard = false;
183 bool HasSMEMtoVectorWriteHazard = false;
184 bool HasInstFwdPrefetchBug = false;
185 bool HasVcmpxExecWARHazard = false;
186 bool HasLdsBranchVmemWARHazard = false;
187 bool HasNSAtoVMEMBug = false;
188 bool HasNSAClauseBug = false;
189 bool HasOffset3fBug = false;
190 bool HasFlatSegmentOffsetBug = false;
191 bool HasImageStoreD16Bug = false;
192 bool HasImageGather4D16Bug = false;
193 bool HasVOPDInsts = false;
195 // Dummy feature to use for assembler in tablegen.
196 bool FeatureDisable = false;
198 SelectionDAGTargetInfo TSInfo;
200 SIInstrInfo InstrInfo;
201 SITargetLowering TLInfo;
202 SIFrameLowering FrameLowering;
205 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
206 const GCNTargetMachine &TM);
207 ~GCNSubtarget() override;
209 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
210 StringRef GPU, StringRef FS);
212 const SIInstrInfo *getInstrInfo() const override {
216 const SIFrameLowering *getFrameLowering() const override {
217 return &FrameLowering;
220 const SITargetLowering *getTargetLowering() const override {
224 const SIRegisterInfo *getRegisterInfo() const override {
225 return &InstrInfo.getRegisterInfo();
228 const CallLowering *getCallLowering() const override {
229 return CallLoweringInfo.get();
232 const InlineAsmLowering *getInlineAsmLowering() const override {
233 return InlineAsmLoweringInfo.get();
236 InstructionSelector *getInstructionSelector() const override {
237 return InstSelector.get();
240 const LegalizerInfo *getLegalizerInfo() const override {
241 return Legalizer.get();
244 const RegisterBankInfo *getRegBankInfo() const override {
245 return RegBankInfo.get();
248 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
252 // Nothing implemented, just prevent crashes on use.
253 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
257 const InstrItineraryData *getInstrItineraryData() const override {
261 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
/// \returns the subtarget generation: the stored \c Gen value converted to the
/// \c Generation enumeration type.
// NOTE(review): this is a C-style cast; static_cast<Generation>(Gen) would
// make the conversion explicit — confirm against the project's cast policy.
263 Generation getGeneration() const {
264 return (Generation)Gen;
/// \returns the maximum wave scratch size for this generation. Generations
/// below GFX11 yield (256 * 4) * (2^13 - 1); all other generations yield
/// (64 * 4) * (2^15 - 1).
// NOTE(review): the "* 4" factor presumably converts dwords to bytes — confirm.
267 unsigned getMaxWaveScratchSize() const {
268 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
269 if (getGeneration() < GFX11) {
270 // 13-bit field in units of 256-dword.
271 return (256 * 4) * ((1 << 13) - 1);
273 // 15-bit field in units of 64-dword.
274 return (64 * 4) * ((1 << 15) - 1);
277 /// Return the number of high bits known to be zero for a frame index.
278 unsigned getKnownHighZeroBitsForFrameIndex() const {
279 return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
282 int getLDSBankCount() const {
/// \returns \c MaxPrivateElementSize when \p ForBufferRSrc is true or when
/// flat scratch is not enabled (see enableFlatScratch()); otherwise returns
/// the constant 16.
286 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
287 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
290 unsigned getConstantBusLimit(unsigned Opcode) const;
292 /// Returns true if an instruction with a 16-bit result returned in a 32-bit
293 /// register implicitly zeroes the high 16 bits, rather than preserving the
294 /// original value.
295 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
297 bool hasIntClamp() const {
301 bool hasFP64() const {
305 bool hasMIMG_R128() const {
309 bool hasHWFP64() const {
313 bool hasFastFMAF32() const {
317 bool hasHalfRate64Ops() const {
318 return HalfRate64Ops;
321 bool hasFullRate64Ops() const {
322 return FullRate64Ops;
/// \returns true if the subtarget generation compares less than
/// VOLCANIC_ISLANDS.
325 bool hasAddr64() const {
326 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
/// \returns true if the subtarget generation compares greater than
/// SOUTHERN_ISLANDS.
329 bool hasFlat() const {
330 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
333 // Return true if the target only has the reverse operand versions of VALU
334 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
335 bool hasOnlyRevVALUShifts() const {
336 return getGeneration() >= VOLCANIC_ISLANDS;
339 bool hasFractBug() const {
340 return getGeneration() == SOUTHERN_ISLANDS;
343 bool hasBFE() const {
347 bool hasBFI() const {
351 bool hasBFM() const {
355 bool hasBCNT(unsigned Size) const {
359 bool hasFFBL() const {
363 bool hasFFBH() const {
367 bool hasMed3_16() const {
368 return getGeneration() >= AMDGPUSubtarget::GFX9;
371 bool hasMin3Max3_16() const {
372 return getGeneration() >= AMDGPUSubtarget::GFX9;
375 bool hasFmaMixInsts() const {
376 return HasFmaMixInsts;
379 bool hasCARRY() const {
383 bool hasFMA() const {
387 bool hasSwap() const {
391 bool hasScalarPackInsts() const {
395 bool hasScalarMulHiInsts() const {
/// \returns TrapHandlerAbi::AMDHSA when isAmdHsaOS() is true, and
/// TrapHandlerAbi::NONE otherwise.
399 TrapHandlerAbi getTrapHandlerAbi() const {
400 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
403 bool supportsGetDoorbellID() const {
404 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
405 return getGeneration() >= GFX9;
408 /// True if the offset field of DS instructions works as expected. On SI, the
409 /// offset uses a 16-bit adder and does not always wrap properly.
410 bool hasUsableDSOffset() const {
411 return getGeneration() >= SEA_ISLANDS;
414 bool unsafeDSOffsetFoldingEnabled() const {
415 return EnableUnsafeDSOffsetFolding;
418 /// Condition output from div_scale is usable.
419 bool hasUsableDivScaleConditionOutput() const {
420 return getGeneration() != SOUTHERN_ISLANDS;
423 /// Extra wait hazard is needed in some cases before
424 /// s_cbranch_vccnz/s_cbranch_vccz.
425 bool hasReadVCCZBug() const {
426 return getGeneration() <= SEA_ISLANDS;
429 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
430 bool partialVCCWritesUpdateVCCZ() const {
431 return getGeneration() >= GFX10;
434 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
435 /// was written by a VALU instruction.
436 bool hasSMRDReadVALUDefHazard() const {
437 return getGeneration() == SOUTHERN_ISLANDS;
440 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
441 /// SGPR was written by a VALU Instruction.
442 bool hasVMEMReadSGPRVALUDefHazard() const {
443 return getGeneration() >= VOLCANIC_ISLANDS;
/// \returns true if the subtarget generation is VOLCANIC_ISLANDS or greater.
// NOTE(review): the name suggests this gates hazard handling around RFE
// instructions — confirm against the hazard recognizer.
446 bool hasRFEHazards() const {
447 return getGeneration() >= VOLCANIC_ISLANDS;
450 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
451 unsigned getSetRegWaitStates() const {
452 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
455 bool dumpCode() const {
459 /// Return the amount of LDS that can be used that will not restrict the
460 /// occupancy lower than WaveCount.
461 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
462 const Function &) const;
/// \returns true if the subtarget generation is GFX9 or greater.
464 bool supportsMinMaxDenormModes() const {
465 return getGeneration() >= AMDGPUSubtarget::GFX9;
468 /// \returns If target supports S_DENORM_MODE.
469 bool hasDenormModeInst() const {
470 return getGeneration() >= AMDGPUSubtarget::GFX10;
473 bool useFlatForGlobal() const {
474 return FlatForGlobal;
477 /// \returns If target supports ds_read/write_b128 and user enables generation
478 /// of ds_read/write_b128.
479 bool useDS128() const {
480 return CIInsts && EnableDS128;
483 /// \return If target supports ds_read/write_b96/128.
484 bool hasDS96AndDS128() const {
488 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
489 bool haveRoundOpsF64() const {
493 /// \returns If MUBUF instructions always perform range checking, even for
494 /// buffer resources used for private memory access.
495 bool privateMemoryResourceIsRangeChecked() const {
496 return getGeneration() < AMDGPUSubtarget::GFX9;
499 /// \returns If target requires PRT Strict NULL support (zero result registers
500 /// for sparse texture support).
501 bool usePRTStrictNull() const {
502 return EnablePRTStrictNull;
505 bool hasAutoWaitcntBeforeBarrier() const {
506 return AutoWaitcntBeforeBarrier;
509 bool hasUnalignedBufferAccess() const {
510 return UnalignedBufferAccess;
/// \returns true only when both \c UnalignedBufferAccess and
/// \c UnalignedAccessMode are set.
513 bool hasUnalignedBufferAccessEnabled() const {
514 return UnalignedBufferAccess && UnalignedAccessMode;
517 bool hasUnalignedDSAccess() const {
518 return UnalignedDSAccess;
/// \returns true only when both \c UnalignedDSAccess and
/// \c UnalignedAccessMode are set.
521 bool hasUnalignedDSAccessEnabled() const {
522 return UnalignedDSAccess && UnalignedAccessMode;
525 bool hasUnalignedScratchAccess() const {
526 return UnalignedScratchAccess;
529 bool hasUnalignedAccessMode() const {
530 return UnalignedAccessMode;
533 bool hasApertureRegs() const {
534 return HasApertureRegs;
537 bool isTrapHandlerEnabled() const {
/// \returns true if \c TargetID reports XNACK as "on" or "any". The setting is
/// read from \c TargetID, not from the \c EnableXNACK member.
541 bool isXNACKEnabled() const {
542 return TargetID.isXnackOnOrAny();
545 bool isTgSplitEnabled() const {
546 return EnableTgSplit;
549 bool isCuModeEnabled() const {
553 bool hasFlatAddressSpace() const {
554 return FlatAddressSpace;
/// \returns the same value as hasFlatAddressSpace().
557 bool hasFlatScrRegister() const {
558 return hasFlatAddressSpace();
561 bool hasFlatInstOffsets() const {
562 return FlatInstOffsets;
565 bool hasFlatGlobalInsts() const {
566 return FlatGlobalInsts;
569 bool hasFlatScratchInsts() const {
570 return FlatScratchInsts;
573 // Check if target supports ST addressing mode with FLAT scratch instructions.
574 // The ST addressing mode means no registers are used, either VGPR or SGPR,
575 // but only immediate offset is swizzled and added to the FLAT scratch base.
576 bool hasFlatScratchSTMode() const {
577 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
/// \returns true if either \c GFX940Insts or \c GFX11Insts is set.
// NOTE(review): presumably "SVS" names a FLAT scratch addressing mode, by
// analogy with the ST mode check nearby — confirm.
580 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
582 bool hasScalarFlatScratchInsts() const {
583 return ScalarFlatScratchInsts;
/// \returns true if flat scratch is architected (flatScratchIsArchitected()),
/// or if \c EnableFlatScratch is set and the target has flat scratch
/// instructions (hasFlatScratchInsts()).
586 bool enableFlatScratch() const {
587 return flatScratchIsArchitected() ||
588 (EnableFlatScratch && hasFlatScratchInsts());
591 bool hasGlobalAddTidInsts() const {
592 return GFX10_BEncoding;
595 bool hasAtomicCSub() const {
596 return GFX10_BEncoding;
599 bool hasMultiDwordFlatScratchAddressing() const {
600 return getGeneration() >= GFX9;
603 bool hasFlatSegmentOffsetBug() const {
604 return HasFlatSegmentOffsetBug;
607 bool hasFlatLgkmVMemCountInOrder() const {
608 return getGeneration() > GFX9;
611 bool hasD16LoadStore() const {
612 return getGeneration() >= GFX9;
/// \returns true if the target has D16 load/store (hasD16LoadStore()) and
/// \c TargetID does not report SRAM ECC as "on" or "any".
615 bool d16PreservesUnusedBits() const {
616 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
619 bool hasD16Images() const {
620 return getGeneration() >= VOLCANIC_ISLANDS;
623 /// Return if most LDS instructions have an m0 use that require m0 to be
625 bool ldsRequiresM0Init() const {
626 return getGeneration() < GFX9;
629 // True if the hardware rewinds and replays GWS operations if a wave is
632 // If this is false, a GWS operation requires testing if a nack set the
633 // MEM_VIOL bit, and repeating if so.
634 bool hasGWSAutoReplay() const {
635 return getGeneration() >= GFX9;
638 /// \returns if target has ds_gws_sema_release_all instruction.
639 bool hasGWSSemaReleaseAll() const {
643 /// \returns true if the target has integer add/sub instructions that do not
644 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
645 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
647 bool hasAddNoCarry() const {
648 return AddNoCarryInsts;
651 bool hasUnpackedD16VMem() const {
652 return HasUnpackedD16VMem;
655 // Covers VS/PS/CS graphics shaders
656 bool isMesaGfxShader(const Function &F) const {
657 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
660 bool hasMad64_32() const {
661 return getGeneration() >= SEA_ISLANDS;
664 bool hasSDWAOmod() const {
668 bool hasSDWAScalar() const {
669 return HasSDWAScalar;
672 bool hasSDWASdst() const {
676 bool hasSDWAMac() const {
680 bool hasSDWAOutModsVOPC() const {
681 return HasSDWAOutModsVOPC;
684 bool hasDLInsts() const {
688 bool hasDot1Insts() const {
692 bool hasDot2Insts() const {
696 bool hasDot3Insts() const {
700 bool hasDot4Insts() const {
704 bool hasDot5Insts() const {
708 bool hasDot6Insts() const {
712 bool hasDot7Insts() const {
716 bool hasDot8Insts() const {
720 bool hasMAIInsts() const {
724 bool hasPkFmacF16Inst() const {
725 return HasPkFmacF16Inst;
/// \returns true if either \c HasAtomicFaddRtnInsts or
/// \c HasAtomicFaddNoRtnInsts is set.
728 bool hasAtomicFaddInsts() const {
729 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
732 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
734 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
736 bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; }
738 bool hasNoSdstCMPX() const {
739 return HasNoSdstCMPX;
742 bool hasVscnt() const {
746 bool hasGetWaveIdInst() const {
747 return HasGetWaveIdInst;
750 bool hasSMemTimeInst() const {
751 return HasSMemTimeInst;
754 bool hasShaderCyclesRegister() const {
755 return HasShaderCyclesRegister;
758 bool hasVOP3Literal() const {
759 return HasVOP3Literal;
762 bool hasNoDataDepHazard() const {
763 return HasNoDataDepHazard;
766 bool vmemWriteNeedsExpWaitcnt() const {
767 return getGeneration() < SEA_ISLANDS;
770 // Scratch is allocated in 256 dword per wave blocks for the entire
771 // wavefront. When viewed from the perspective of an arbitrary workitem, this
772 // is 4-byte aligned.
774 // Only 4-byte alignment is really needed to access anything. Transformations
775 // on the pointer value itself may rely on the alignment / known low bits of
776 // the pointer. Set this to something above the minimum to avoid needing
777 // dynamic realignment in common cases.
778 Align getStackAlignment() const { return Align(16); }
780 bool enableMachineScheduler() const override {
784 bool useAA() const override;
786 bool enableSubRegLiveness() const override {
/// Sets the \c ScalarizeGlobal flag to \p b.
790 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
/// \returns the current value of the \c ScalarizeGlobal flag.
791 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
794 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
796 // XXX - Why is this here if it isn't in the default pass set?
797 bool enableEarlyIfConversion() const override {
801 void overrideSchedPolicy(MachineSchedPolicy &Policy,
802 unsigned NumRegionInstrs) const override;
804 unsigned getMaxNumUserSGPRs() const {
808 bool hasSMemRealTime() const {
809 return HasSMemRealTime;
812 bool hasMovrel() const {
816 bool hasVGPRIndexMode() const {
817 return HasVGPRIndexMode;
820 bool useVGPRIndexMode() const;
822 bool hasScalarCompareEq64() const {
823 return getGeneration() >= VOLCANIC_ISLANDS;
826 bool hasScalarStores() const {
827 return HasScalarStores;
830 bool hasScalarAtomics() const {
831 return HasScalarAtomics;
834 bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
836 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
837 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
839 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
840 bool hasPermLane64() const { return getGeneration() >= GFX11; }
842 bool hasDPP() const {
/// \returns true if \c HasDPP is set and the generation is below GFX10.
846 bool hasDPPBroadcasts() const {
847 return HasDPP && getGeneration() < GFX10;
/// \returns true if \c HasDPP is set and the generation is below GFX10.
850 bool hasDPPWavefrontShifts() const {
851 return HasDPP && getGeneration() < GFX10;
854 bool hasDPP8() const {
858 bool has64BitDPP() const {
862 bool hasPackedFP32Ops() const {
863 return HasPackedFP32Ops;
/// \returns true if the generation is GFX10 or greater, or if the target has
/// GFX940 instructions (hasGFX940Insts()).
866 bool hasFmaakFmamkF32Insts() const {
867 return getGeneration() >= GFX10 || hasGFX940Insts();
870 bool hasImageInsts() const {
871 return HasImageInsts;
874 bool hasExtendedImageInsts() const {
875 return HasExtendedImageInsts;
878 bool hasR128A16() const {
882 bool hasGFX10A16() const {
/// \returns true if either hasR128A16() or hasGFX10A16() is true.
886 bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
888 bool hasG16() const { return HasG16; }
890 bool hasOffset3fBug() const {
891 return HasOffset3fBug;
894 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
896 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
898 bool hasNSAEncoding() const { return HasNSAEncoding; }
900 unsigned getNSAMaxSize() const { return NSAMaxSize; }
902 bool hasGFX10_AEncoding() const {
903 return GFX10_AEncoding;
906 bool hasGFX10_BEncoding() const {
907 return GFX10_BEncoding;
910 bool hasGFX10_3Insts() const {
914 bool hasMadF16() const;
916 bool hasMovB64() const { return GFX940Insts; }
918 bool hasLshlAddB64() const { return GFX940Insts; }
920 bool enableSIScheduler() const {
921 return EnableSIScheduler;
924 bool loadStoreOptEnabled() const {
925 return EnableLoadStoreOpt;
928 bool hasSGPRInitBug() const {
932 bool hasUserSGPRInit16Bug() const {
933 return UserSGPRInit16Bug;
936 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
938 bool hasNegativeUnalignedScratchOffsetBug() const {
939 return NegativeUnalignedScratchOffsetBug;
942 bool hasMFMAInlineLiteralBug() const {
943 return HasMFMAInlineLiteralBug;
/// \returns true for every generation except SOUTHERN_ISLANDS.
946 bool has12DWordStoreHazard() const {
947 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
950 // \returns true if the subtarget supports DWORDX3 load/store instructions.
951 bool hasDwordx3LoadStores() const {
955 bool hasReadM0MovRelInterpHazard() const {
956 return getGeneration() == AMDGPUSubtarget::GFX9;
/// \returns true if the generation lies in the inclusive range
/// [VOLCANIC_ISLANDS, GFX9].
959 bool hasReadM0SendMsgHazard() const {
960 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
961 getGeneration() <= AMDGPUSubtarget::GFX9;
964 bool hasReadM0LdsDmaHazard() const {
965 return getGeneration() == AMDGPUSubtarget::GFX9;
968 bool hasReadM0LdsDirectHazard() const {
969 return getGeneration() == AMDGPUSubtarget::GFX9;
972 bool hasVcmpxPermlaneHazard() const {
973 return HasVcmpxPermlaneHazard;
976 bool hasVMEMtoScalarWriteHazard() const {
977 return HasVMEMtoScalarWriteHazard;
980 bool hasSMEMtoVectorWriteHazard() const {
981 return HasSMEMtoVectorWriteHazard;
/// \returns true if \c LDSMisalignedBug is set and \c EnableCuMode is not;
/// enabling CU mode therefore makes this report false.
984 bool hasLDSMisalignedBug() const {
985 return LDSMisalignedBug && !EnableCuMode;
988 bool hasInstFwdPrefetchBug() const {
989 return HasInstFwdPrefetchBug;
992 bool hasVcmpxExecWARHazard() const {
993 return HasVcmpxExecWARHazard;
996 bool hasLdsBranchVmemWARHazard() const {
997 return HasLdsBranchVmemWARHazard;
1000 // Has one cycle hazard on transcendental instruction feeding a
1001 // non transcendental VALU.
1002 bool hasTransForwardingHazard() const { return GFX940Insts; }
1004 // Has one cycle hazard on a VALU instruction partially writing dst with
1005 // a shift of result bits feeding another VALU instruction.
1006 bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1008 // Cannot use op_sel with v_dot instructions.
1009 bool hasDOTOpSelHazard() const { return GFX940Insts; }
1011 // Does not have HW interlocks for VALU writing and then reading SGPRs.
1012 bool hasVDecCoExecHazard() const {
1016 bool hasNSAtoVMEMBug() const {
1017 return HasNSAtoVMEMBug;
1020 bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1022 bool hasHardClauses() const { return getGeneration() >= GFX10; }
1024 bool hasGFX90AInsts() const { return GFX90AInsts; }
1026 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1028 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1030 bool hasVALUPartialForwardingHazard() const {
1031 return getGeneration() >= GFX11;
1034 bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; }
1036 /// Return if operations acting on VGPR tuples require even alignment.
1037 bool needsAlignedVGPRs() const { return GFX90AInsts; }
1039 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1040 bool hasSPackHL() const { return GFX11Insts; }
1042 /// Return true if the target's EXP instruction has the COMPR flag, which
1043 /// affects the meaning of the EN (enable) bits.
1044 bool hasCompressedExport() const { return !GFX11Insts; }
1046 /// Return true if the target's EXP instruction supports the NULL export
1048 bool hasNullExportTarget() const { return !GFX11Insts; }
1050 bool hasVOPDInsts() const { return HasVOPDInsts; }
/// \returns true only when the generation is exactly GFX11.
1052 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1054 /// Return true if the target has the S_DELAY_ALU instruction.
1055 bool hasDelayAlu() const { return GFX11Insts; }
1057 bool hasPackedTID() const { return HasPackedTID; }
1059 // GFX940 is a derivative of GFX90A. hasGFX940Insts() being true implies that
1060 // hasGFX90AInsts is also true.
1061 bool hasGFX940Insts() const { return GFX940Insts; }
1063 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1065 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1067 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1069 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1071 /// Return occupancy for the given function. Uses the LDS size and the
1072 /// register counts if provided.
1073 /// Note, occupancy can be affected by the scratch allocation as well, but
1074 /// we do not have enough information to compute it.
1075 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
1076 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1078 /// \returns true if the flat_scratch register should be initialized with the
1079 /// pointer to the wave's scratch memory rather than a size and offset.
1080 bool flatScratchIsPointer() const {
1081 return getGeneration() >= AMDGPUSubtarget::GFX9;
1084 /// \returns true if the flat_scratch register is initialized by the HW.
1085 /// In this case it is readonly.
1086 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1088 /// \returns true if the machine has merged shaders in which s0-s7 are
1089 /// reserved by the hardware and user SGPRs start at s8
1090 bool hasMergedShaders() const {
1091 return getGeneration() >= GFX9;
1094 // \returns true if the target supports the pre-NGG legacy geometry path.
1095 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1097 /// \returns SGPR allocation granularity supported by the subtarget.
1098 unsigned getSGPRAllocGranule() const {
1099 return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1102 /// \returns SGPR encoding granularity supported by the subtarget.
1103 unsigned getSGPREncodingGranule() const {
1104 return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1107 /// \returns Total number of SGPRs supported by the subtarget.
1108 unsigned getTotalNumSGPRs() const {
1109 return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1112 /// \returns Addressable number of SGPRs supported by the subtarget.
1113 unsigned getAddressableNumSGPRs() const {
1114 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1117 /// \returns Minimum number of SGPRs that meets the given number of waves per
1118 /// execution unit requirement supported by the subtarget.
1119 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1120 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1123 /// \returns Maximum number of SGPRs that meets the given number of waves per
1124 /// execution unit requirement supported by the subtarget.
1125 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1126 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1129 /// \returns Reserved number of SGPRs. This is common
1130 /// utility function called by MachineFunction and
1131 /// Function variants of getReservedNumSGPRs.
1132 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1133 /// \returns Reserved number of SGPRs for given machine function \p MF.
1134 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1136 /// \returns Reserved number of SGPRs for given function \p F.
1137 unsigned getReservedNumSGPRs(const Function &F) const;
1139 /// \returns max num SGPRs. This is the common utility
1140 /// function called by MachineFunction and Function
1141 /// variants of getMaxNumSGPRs.
1142 unsigned getBaseMaxNumSGPRs(const Function &F,
1143 std::pair<unsigned, unsigned> WavesPerEU,
1144 unsigned PreloadedSGPRs,
1145 unsigned ReservedNumSGPRs) const;
1147 /// \returns Maximum number of SGPRs that meets number of waves per execution
1148 /// unit requirement for function \p MF, or number of SGPRs explicitly
1149 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1151 /// \returns Value that meets number of waves per execution unit requirement
1152 /// if explicitly requested value cannot be converted to integer, violates
1153 /// subtarget's specifications, or does not meet number of waves per execution
1154 /// unit requirement.
1155 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1157 /// \returns Maximum number of SGPRs that meets number of waves per execution
1158 /// unit requirement for function \p F, or number of SGPRs explicitly
1159 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1161 /// \returns Value that meets number of waves per execution unit requirement
1162 /// if explicitly requested value cannot be converted to integer, violates
1163 /// subtarget's specifications, or does not meet number of waves per execution
1164 /// unit requirement.
1165 unsigned getMaxNumSGPRs(const Function &F) const;
1167 /// \returns VGPR allocation granularity supported by the subtarget.
1168 unsigned getVGPRAllocGranule() const {
1169 return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1172 /// \returns VGPR encoding granularity supported by the subtarget.
1173 unsigned getVGPREncodingGranule() const {
1174 return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1177 /// \returns Total number of VGPRs supported by the subtarget.
1178 unsigned getTotalNumVGPRs() const {
1179 return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1182 /// \returns Addressable number of VGPRs supported by the subtarget.
1183 unsigned getAddressableNumVGPRs() const {
1184 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1187 /// \returns Minimum number of VGPRs that meets given number of waves per
1188 /// execution unit requirement supported by the subtarget.
1189 unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1190 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1193 /// \returns Maximum number of VGPRs that meets given number of waves per
1194 /// execution unit requirement supported by the subtarget.
1195 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1196 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1199 /// \returns max num VGPRs. This is the common utility function
1200 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1201 unsigned getBaseMaxNumVGPRs(const Function &F,
1202 std::pair<unsigned, unsigned> WavesPerEU) const;
1203 /// \returns Maximum number of VGPRs that meets number of waves per execution
1204 /// unit requirement for function \p F, or number of VGPRs explicitly
1205 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1207 /// \returns Value that meets number of waves per execution unit requirement
1208 /// if explicitly requested value cannot be converted to integer, violates
1209 /// subtarget's specifications, or does not meet number of waves per execution
1210 /// unit requirement.
1211 unsigned getMaxNumVGPRs(const Function &F) const;
/// \returns the maximum number of AGPRs for function \p F. This forwards to
/// getMaxNumVGPRs(F), so the result is the same value as the VGPR limit.
1213 unsigned getMaxNumAGPRs(const Function &F) const {
1214 return getMaxNumVGPRs(F);
1217 /// \returns Maximum number of VGPRs that meets number of waves per execution
1218 /// unit requirement for function \p MF, or number of VGPRs explicitly
1219 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1221 /// \returns Value that meets number of waves per execution unit requirement
1222 /// if explicitly requested value cannot be converted to integer, violates
1223 /// subtarget's specifications, or does not meet number of waves per execution
1224 /// unit requirement.
1225 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1227 void getPostRAMutations(
1228 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1231 std::unique_ptr<ScheduleDAGMutation>
1232 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
/// \returns true if getWavefrontSize() is exactly 32.
1234 bool isWave32() const {
1235 return getWavefrontSize() == 32;
/// \returns true if getWavefrontSize() is exactly 64.
1238 bool isWave64() const {
1239 return getWavefrontSize() == 64;
/// \returns the register class reported by getRegisterInfo()->getBoolRC().
1242 const TargetRegisterClass *getBoolRC() const {
1243 return getRegisterInfo()->getBoolRC();
1246 /// \returns Maximum number of work groups per compute unit supported by the
1247 /// subtarget and limited by given \p FlatWorkGroupSize.
1248 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1249 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1252 /// \returns Minimum flat work group size supported by the subtarget.
1253 unsigned getMinFlatWorkGroupSize() const override {
1254 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1257 /// \returns Maximum flat work group size supported by the subtarget.
1258 unsigned getMaxFlatWorkGroupSize() const override {
1259 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1262 /// \returns Number of waves per execution unit required to support the given
1263 /// \p FlatWorkGroupSize.
1265 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1266 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1269 /// \returns Minimum number of waves per execution unit supported by the
1271 unsigned getMinWavesPerEU() const override {
1272 return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1275 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1276 SDep &Dep) const override;
1278 // \returns true if it's beneficial on this subtarget for the scheduler to
1279 // cluster stores as well as loads.
1280 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1283 } // end namespace llvm
1285 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H