1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //==-----------------------------------------------------------------------===//
10 /// AMD GCN specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIFrameLowering.h"
20 #include "SIISelLowering.h"
21 #include "SIInstrInfo.h"
22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
24 #define GET_SUBTARGETINFO_HEADER
25 #include "AMDGPUGenSubtargetInfo.inc"
29 class GCNTargetMachine;
31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
32 public AMDGPUSubtarget {
34 using AMDGPUSubtarget::getMaxWavesPerEU;
36 // Following 2 enums are documented at:
37 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
38 enum class TrapHandlerAbi {
44 LLVMAMDHSATrap = 0x02,
45 LLVMAMDHSADebugTrap = 0x03,
49 /// GlobalISel related APIs.
50 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
51 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
52 std::unique_ptr<InstructionSelector> InstSelector;
53 std::unique_ptr<LegalizerInfo> Legalizer;
54 std::unique_ptr<RegisterBankInfo> RegBankInfo;
57 // Basic subtarget description.
59 AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
60 unsigned Gen = INVALID;
61 InstrItineraryData InstrItins;
63 unsigned MaxPrivateElementSize = 0;
65 // Possibly statically set by tablegen, but may want to be overridden.
66 bool FastFMAF32 = false;
67 bool FastDenormalF32 = false;
68 bool HalfRate64Ops = false;
69 bool FullRate64Ops = false;
71 // Dynamically set bits that enable features.
72 bool FlatForGlobal = false;
73 bool AutoWaitcntBeforeBarrier = false;
74 bool BackOffBarrier = false;
75 bool UnalignedScratchAccess = false;
76 bool UnalignedAccessMode = false;
77 bool HasApertureRegs = false;
78 bool SupportsXNACK = false;
80 // This should not be used directly. 'TargetID' tracks the dynamic settings
82 bool EnableXNACK = false;
84 bool EnableTgSplit = false;
85 bool EnableCuMode = false;
86 bool TrapHandler = false;
89 bool EnableLoadStoreOpt = false;
90 bool EnableUnsafeDSOffsetFolding = false;
91 bool EnableSIScheduler = false;
92 bool EnableDS128 = false;
93 bool EnablePRTStrictNull = false;
94 bool DumpCode = false;
96 // Subtarget properties statically set by tablegen.
99 bool MIMG_R128 = false;
100 bool CIInsts = false;
101 bool GFX8Insts = false;
102 bool GFX9Insts = false;
103 bool GFX90AInsts = false;
104 bool GFX940Insts = false;
105 bool GFX10Insts = false;
106 bool GFX11Insts = false;
107 bool GFX10_3Insts = false;
108 bool GFX7GFX8GFX9Insts = false;
109 bool SGPRInitBug = false;
110 bool UserSGPRInit16Bug = false;
111 bool NegativeScratchOffsetBug = false;
112 bool NegativeUnalignedScratchOffsetBug = false;
113 bool HasSMemRealTime = false;
114 bool HasIntClamp = false;
115 bool HasFmaMixInsts = false;
116 bool HasMovrel = false;
117 bool HasVGPRIndexMode = false;
118 bool HasScalarStores = false;
119 bool HasScalarAtomics = false;
120 bool HasSDWAOmod = false;
121 bool HasSDWAScalar = false;
122 bool HasSDWASdst = false;
123 bool HasSDWAMac = false;
124 bool HasSDWAOutModsVOPC = false;
126 bool HasDPP8 = false;
127 bool Has64BitDPP = false;
128 bool HasPackedFP32Ops = false;
129 bool HasImageInsts = false;
130 bool HasExtendedImageInsts = false;
131 bool HasR128A16 = false;
134 bool HasNSAEncoding = false;
135 unsigned NSAMaxSize = 0;
136 bool GFX10_AEncoding = false;
137 bool GFX10_BEncoding = false;
138 bool HasDLInsts = false;
139 bool HasFmacF64Inst = false;
140 bool HasDot1Insts = false;
141 bool HasDot2Insts = false;
142 bool HasDot3Insts = false;
143 bool HasDot4Insts = false;
144 bool HasDot5Insts = false;
145 bool HasDot6Insts = false;
146 bool HasDot7Insts = false;
147 bool HasDot8Insts = false;
148 bool HasDot9Insts = false;
149 bool HasMAIInsts = false;
150 bool HasFP8Insts = false;
151 bool HasPkFmacF16Inst = false;
152 bool HasAtomicFaddRtnInsts = false;
153 bool HasAtomicFaddNoRtnInsts = false;
154 bool HasAtomicPkFaddNoRtnInsts = false;
155 bool HasFlatAtomicFaddF32Inst = false;
156 bool SupportsSRAMECC = false;
158 // This should not be used directly. 'TargetID' tracks the dynamic settings
160 bool EnableSRAMECC = false;
162 bool HasNoSdstCMPX = false;
163 bool HasVscnt = false;
164 bool HasGetWaveIdInst = false;
165 bool HasSMemTimeInst = false;
166 bool HasShaderCyclesRegister = false;
167 bool HasVOP3Literal = false;
168 bool HasNoDataDepHazard = false;
169 bool FlatAddressSpace = false;
170 bool FlatInstOffsets = false;
171 bool FlatGlobalInsts = false;
172 bool FlatScratchInsts = false;
173 bool ScalarFlatScratchInsts = false;
174 bool HasArchitectedFlatScratch = false;
175 bool EnableFlatScratch = false;
176 bool AddNoCarryInsts = false;
177 bool HasUnpackedD16VMem = false;
178 bool LDSMisalignedBug = false;
179 bool HasMFMAInlineLiteralBug = false;
180 bool UnalignedBufferAccess = false;
181 bool UnalignedDSAccess = false;
182 bool HasPackedTID = false;
183 bool ScalarizeGlobal = false;
185 bool HasVcmpxPermlaneHazard = false;
186 bool HasVMEMtoScalarWriteHazard = false;
187 bool HasSMEMtoVectorWriteHazard = false;
188 bool HasInstFwdPrefetchBug = false;
189 bool HasVcmpxExecWARHazard = false;
190 bool HasLdsBranchVmemWARHazard = false;
191 bool HasNSAtoVMEMBug = false;
192 bool HasNSAClauseBug = false;
193 bool HasOffset3fBug = false;
194 bool HasFlatSegmentOffsetBug = false;
195 bool HasImageStoreD16Bug = false;
196 bool HasImageGather4D16Bug = false;
197 bool HasGFX11FullVGPRs = false;
198 bool HasMADIntraFwdBug = false;
199 bool HasVOPDInsts = false;
200 bool HasVALUTransUseHazard = false;
202 // Dummy feature to use for assembler in tablegen.
203 bool FeatureDisable = false;
205 SelectionDAGTargetInfo TSInfo;
207 SIInstrInfo InstrInfo;
208 SITargetLowering TLInfo;
209 SIFrameLowering FrameLowering;
212 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
213 const GCNTargetMachine &TM);
214 ~GCNSubtarget() override;
216 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
217 StringRef GPU, StringRef FS);
219 const SIInstrInfo *getInstrInfo() const override {
223 const SIFrameLowering *getFrameLowering() const override {
224 return &FrameLowering;
227 const SITargetLowering *getTargetLowering() const override {
231 const SIRegisterInfo *getRegisterInfo() const override {
232 return &InstrInfo.getRegisterInfo();
235 const CallLowering *getCallLowering() const override {
236 return CallLoweringInfo.get();
239 const InlineAsmLowering *getInlineAsmLowering() const override {
240 return InlineAsmLoweringInfo.get();
243 InstructionSelector *getInstructionSelector() const override {
244 return InstSelector.get();
247 const LegalizerInfo *getLegalizerInfo() const override {
248 return Legalizer.get();
251 const RegisterBankInfo *getRegBankInfo() const override {
252 return RegBankInfo.get();
255 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
259 // Nothing implemented, just prevent crashes on use.
260 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
264 const InstrItineraryData *getInstrItineraryData() const override {
268 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
270 Generation getGeneration() const {
271 return (Generation)Gen;
274 unsigned getMaxWaveScratchSize() const {
275 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
276 if (getGeneration() < GFX11) {
277 // 13-bit field in units of 256-dword.
278 return (256 * 4) * ((1 << 13) - 1);
280 // 15-bit field in units of 64-dword.
281 return (64 * 4) * ((1 << 15) - 1);
284 /// Return the number of high bits known to be zero for a frame index.
285 unsigned getKnownHighZeroBitsForFrameIndex() const {
286 return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
289 int getLDSBankCount() const {
293 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
294 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
297 unsigned getConstantBusLimit(unsigned Opcode) const;
299 /// \returns true if an instruction that returns a 16-bit result in a 32-bit
300 /// register implicitly zeroes the high 16 bits, rather than preserving
301 /// the original value.
302 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
/// \returns true when the subtarget generation is GFX10 or newer.
/// NOTE(review): "WGP" presumably means workgroup-processor mode — confirm.
304 bool supportsWGP() const { return getGeneration() >= GFX10; }
306 bool hasIntClamp() const {
310 bool hasFP64() const {
314 bool hasMIMG_R128() const {
318 bool hasHWFP64() const {
322 bool hasFastFMAF32() const {
326 bool hasHalfRate64Ops() const {
327 return HalfRate64Ops;
330 bool hasFullRate64Ops() const {
331 return FullRate64Ops;
334 bool hasAddr64() const {
335 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
338 bool hasFlat() const {
339 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
342 // Return true if the target only has the reverse operand versions of VALU
343 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
344 bool hasOnlyRevVALUShifts() const {
345 return getGeneration() >= VOLCANIC_ISLANDS;
348 bool hasFractBug() const {
349 return getGeneration() == SOUTHERN_ISLANDS;
352 bool hasBFE() const {
356 bool hasBFI() const {
360 bool hasBFM() const {
364 bool hasBCNT(unsigned Size) const {
368 bool hasFFBL() const {
372 bool hasFFBH() const {
376 bool hasMed3_16() const {
377 return getGeneration() >= AMDGPUSubtarget::GFX9;
380 bool hasMin3Max3_16() const {
381 return getGeneration() >= AMDGPUSubtarget::GFX9;
384 bool hasFmaMixInsts() const {
385 return HasFmaMixInsts;
388 bool hasCARRY() const {
392 bool hasFMA() const {
396 bool hasSwap() const {
400 bool hasScalarPackInsts() const {
404 bool hasScalarMulHiInsts() const {
408 TrapHandlerAbi getTrapHandlerAbi() const {
409 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
412 bool supportsGetDoorbellID() const {
413 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
414 return getGeneration() >= GFX9;
417 /// True if the offset field of DS instructions works as expected. On SI, the
418 /// offset uses a 16-bit adder and does not always wrap properly.
419 bool hasUsableDSOffset() const {
420 return getGeneration() >= SEA_ISLANDS;
423 bool unsafeDSOffsetFoldingEnabled() const {
424 return EnableUnsafeDSOffsetFolding;
427 /// Condition output from div_scale is usable.
428 bool hasUsableDivScaleConditionOutput() const {
429 return getGeneration() != SOUTHERN_ISLANDS;
432 /// Extra wait hazard is needed in some cases before
433 /// s_cbranch_vccnz/s_cbranch_vccz.
434 bool hasReadVCCZBug() const {
435 return getGeneration() <= SEA_ISLANDS;
438 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
439 bool partialVCCWritesUpdateVCCZ() const {
440 return getGeneration() >= GFX10;
443 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
444 /// was written by a VALU instruction.
445 bool hasSMRDReadVALUDefHazard() const {
446 return getGeneration() == SOUTHERN_ISLANDS;
449 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
450 /// SGPR was written by a VALU Instruction.
451 bool hasVMEMReadSGPRVALUDefHazard() const {
452 return getGeneration() >= VOLCANIC_ISLANDS;
455 bool hasRFEHazards() const {
456 return getGeneration() >= VOLCANIC_ISLANDS;
459 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
460 unsigned getSetRegWaitStates() const {
461 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
464 bool dumpCode() const {
468 /// Return the amount of LDS that can be used that will not restrict the
469 /// occupancy lower than WaveCount.
470 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
471 const Function &) const;
473 bool supportsMinMaxDenormModes() const {
474 return getGeneration() >= AMDGPUSubtarget::GFX9;
477 /// \returns If target supports S_DENORM_MODE.
478 bool hasDenormModeInst() const {
479 return getGeneration() >= AMDGPUSubtarget::GFX10;
482 bool useFlatForGlobal() const {
483 return FlatForGlobal;
486 /// \returns If target supports ds_read/write_b128 and user enables generation
487 /// of ds_read/write_b128.
488 bool useDS128() const {
489 return CIInsts && EnableDS128;
492 /// \return If target supports ds_read/write_b96/128.
493 bool hasDS96AndDS128() const {
497 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
498 bool haveRoundOpsF64() const {
502 /// \returns If MUBUF instructions always perform range checking, even for
503 /// buffer resources used for private memory access.
504 bool privateMemoryResourceIsRangeChecked() const {
505 return getGeneration() < AMDGPUSubtarget::GFX9;
508 /// \returns If target requires PRT Strict NULL support (zero result registers
509 /// for sparse texture support).
510 bool usePRTStrictNull() const {
511 return EnablePRTStrictNull;
514 bool hasAutoWaitcntBeforeBarrier() const {
515 return AutoWaitcntBeforeBarrier;
518 /// \returns true if the target supports backing off of s_barrier instructions
519 /// when an exception is raised.
520 bool supportsBackOffBarrier() const {
521 return BackOffBarrier;
524 bool hasUnalignedBufferAccess() const {
525 return UnalignedBufferAccess;
528 bool hasUnalignedBufferAccessEnabled() const {
529 return UnalignedBufferAccess && UnalignedAccessMode;
532 bool hasUnalignedDSAccess() const {
533 return UnalignedDSAccess;
536 bool hasUnalignedDSAccessEnabled() const {
537 return UnalignedDSAccess && UnalignedAccessMode;
540 bool hasUnalignedScratchAccess() const {
541 return UnalignedScratchAccess;
544 bool hasUnalignedAccessMode() const {
545 return UnalignedAccessMode;
548 bool hasApertureRegs() const {
549 return HasApertureRegs;
552 bool isTrapHandlerEnabled() const {
556 bool isXNACKEnabled() const {
557 return TargetID.isXnackOnOrAny();
560 bool isTgSplitEnabled() const {
561 return EnableTgSplit;
564 bool isCuModeEnabled() const {
568 bool hasFlatAddressSpace() const {
569 return FlatAddressSpace;
572 bool hasFlatScrRegister() const {
573 return hasFlatAddressSpace();
576 bool hasFlatInstOffsets() const {
577 return FlatInstOffsets;
580 bool hasFlatGlobalInsts() const {
581 return FlatGlobalInsts;
584 bool hasFlatScratchInsts() const {
585 return FlatScratchInsts;
588 // Check if target supports ST addressing mode with FLAT scratch instructions.
589 // The ST addressing mode means no registers are used, either VGPR or SGPR,
590 // but only immediate offset is swizzled and added to the FLAT scratch base.
591 bool hasFlatScratchSTMode() const {
592 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
/// \returns true if either the GFX940Insts or the GFX11Insts feature bit is
/// set. NOTE(review): "SVS" presumably names a FLAT scratch addressing mode
/// that uses both an SGPR and a VGPR — confirm against the ISA documentation.
595 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
597 bool hasScalarFlatScratchInsts() const {
598 return ScalarFlatScratchInsts;
601 bool enableFlatScratch() const {
602 return flatScratchIsArchitected() ||
603 (EnableFlatScratch && hasFlatScratchInsts());
606 bool hasGlobalAddTidInsts() const {
607 return GFX10_BEncoding;
610 bool hasAtomicCSub() const {
611 return GFX10_BEncoding;
614 bool hasMultiDwordFlatScratchAddressing() const {
615 return getGeneration() >= GFX9;
618 bool hasFlatSegmentOffsetBug() const {
619 return HasFlatSegmentOffsetBug;
622 bool hasFlatLgkmVMemCountInOrder() const {
623 return getGeneration() > GFX9;
626 bool hasD16LoadStore() const {
627 return getGeneration() >= GFX9;
630 bool d16PreservesUnusedBits() const {
631 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
634 bool hasD16Images() const {
635 return getGeneration() >= VOLCANIC_ISLANDS;
638 /// Return if most LDS instructions have an m0 use that require m0 to be
640 bool ldsRequiresM0Init() const {
641 return getGeneration() < GFX9;
644 // True if the hardware rewinds and replays GWS operations if a wave is
647 // If this is false, a GWS operation requires testing if a nack set the
648 // MEM_VIOL bit, and repeating if so.
649 bool hasGWSAutoReplay() const {
650 return getGeneration() >= GFX9;
653 /// \returns if target has ds_gws_sema_release_all instruction.
654 bool hasGWSSemaReleaseAll() const {
658 /// \returns true if the target has integer add/sub instructions that do not
659 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
660 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
662 bool hasAddNoCarry() const {
663 return AddNoCarryInsts;
666 bool hasUnpackedD16VMem() const {
667 return HasUnpackedD16VMem;
670 // Covers VS/PS/CS graphics shaders
671 bool isMesaGfxShader(const Function &F) const {
672 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
675 bool hasMad64_32() const {
676 return getGeneration() >= SEA_ISLANDS;
679 bool hasSDWAOmod() const {
683 bool hasSDWAScalar() const {
684 return HasSDWAScalar;
687 bool hasSDWASdst() const {
691 bool hasSDWAMac() const {
695 bool hasSDWAOutModsVOPC() const {
696 return HasSDWAOutModsVOPC;
699 bool hasDLInsts() const {
703 bool hasFmacF64Inst() const { return HasFmacF64Inst; }
705 bool hasDot1Insts() const {
709 bool hasDot2Insts() const {
713 bool hasDot3Insts() const {
717 bool hasDot4Insts() const {
721 bool hasDot5Insts() const {
725 bool hasDot6Insts() const {
729 bool hasDot7Insts() const {
733 bool hasDot8Insts() const {
737 bool hasDot9Insts() const {
741 bool hasMAIInsts() const {
745 bool hasFP8Insts() const {
749 bool hasPkFmacF16Inst() const {
750 return HasPkFmacF16Inst;
753 bool hasAtomicFaddInsts() const {
754 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
757 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
759 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
761 bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; }
763 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
765 bool hasNoSdstCMPX() const {
766 return HasNoSdstCMPX;
769 bool hasVscnt() const {
773 bool hasGetWaveIdInst() const {
774 return HasGetWaveIdInst;
777 bool hasSMemTimeInst() const {
778 return HasSMemTimeInst;
781 bool hasShaderCyclesRegister() const {
782 return HasShaderCyclesRegister;
785 bool hasVOP3Literal() const {
786 return HasVOP3Literal;
789 bool hasNoDataDepHazard() const {
790 return HasNoDataDepHazard;
793 bool vmemWriteNeedsExpWaitcnt() const {
794 return getGeneration() < SEA_ISLANDS;
/// \returns true when the subtarget generation is GFX10 or newer.
797 bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
799 // Scratch is allocated in 256 dword per wave blocks for the entire
800 // wavefront. When viewed from the perspective of an arbitrary workitem, this
801 // is 4-byte aligned.
803 // Only 4-byte alignment is really needed to access anything. Transformations
804 // on the pointer value itself may rely on the alignment / known low bits of
805 // the pointer. Set this to something above the minimum to avoid needing
806 // dynamic realignment in common cases.
807 Align getStackAlignment() const { return Align(16); }
809 bool enableMachineScheduler() const override {
813 bool useAA() const override;
815 bool enableSubRegLiveness() const override {
/// Overwrites the ScalarizeGlobal setting with \p b.
819 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
/// \returns the current ScalarizeGlobal setting.
820 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
823 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
825 // XXX - Why is this here if it isn't in the default pass set?
826 bool enableEarlyIfConversion() const override {
830 void overrideSchedPolicy(MachineSchedPolicy &Policy,
831 unsigned NumRegionInstrs) const override;
833 unsigned getMaxNumUserSGPRs() const {
837 bool hasSMemRealTime() const {
838 return HasSMemRealTime;
841 bool hasMovrel() const {
845 bool hasVGPRIndexMode() const {
846 return HasVGPRIndexMode;
849 bool useVGPRIndexMode() const;
851 bool hasScalarCompareEq64() const {
852 return getGeneration() >= VOLCANIC_ISLANDS;
855 bool hasScalarStores() const {
856 return HasScalarStores;
859 bool hasScalarAtomics() const {
860 return HasScalarAtomics;
/// \returns the GFX8Insts feature bit; this query is answered purely from
/// that bit rather than from a dedicated feature flag.
863 bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
865 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
866 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
868 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
869 bool hasPermLane64() const { return getGeneration() >= GFX11; }
871 bool hasDPP() const {
875 bool hasDPPBroadcasts() const {
876 return HasDPP && getGeneration() < GFX10;
879 bool hasDPPWavefrontShifts() const {
880 return HasDPP && getGeneration() < GFX10;
883 bool hasDPP8() const {
887 bool has64BitDPP() const {
891 bool hasPackedFP32Ops() const {
892 return HasPackedFP32Ops;
895 bool hasFmaakFmamkF32Insts() const {
896 return getGeneration() >= GFX10 || hasGFX940Insts();
899 bool hasImageInsts() const {
900 return HasImageInsts;
903 bool hasExtendedImageInsts() const {
904 return HasExtendedImageInsts;
907 bool hasR128A16() const {
911 bool hasA16() const { return HasA16; }
913 bool hasG16() const { return HasG16; }
915 bool hasOffset3fBug() const {
916 return HasOffset3fBug;
919 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
921 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
923 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
925 bool hasNSAEncoding() const { return HasNSAEncoding; }
927 unsigned getNSAMaxSize() const { return NSAMaxSize; }
929 bool hasGFX10_AEncoding() const {
930 return GFX10_AEncoding;
933 bool hasGFX10_BEncoding() const {
934 return GFX10_BEncoding;
937 bool hasGFX10_3Insts() const {
941 bool hasMadF16() const;
/// \returns the GFX940Insts feature bit.
/// NOTE(review): presumably gates a 64-bit move instruction — confirm.
943 bool hasMovB64() const { return GFX940Insts; }
/// \returns the GFX940Insts feature bit.
/// NOTE(review): presumably gates a 64-bit shift-left-and-add instruction —
/// confirm.
945 bool hasLshlAddB64() const { return GFX940Insts; }
947 bool enableSIScheduler() const {
948 return EnableSIScheduler;
951 bool loadStoreOptEnabled() const {
952 return EnableLoadStoreOpt;
955 bool hasSGPRInitBug() const {
959 bool hasUserSGPRInit16Bug() const {
960 return UserSGPRInit16Bug && isWave32();
963 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
965 bool hasNegativeUnalignedScratchOffsetBug() const {
966 return NegativeUnalignedScratchOffsetBug;
969 bool hasMFMAInlineLiteralBug() const {
970 return HasMFMAInlineLiteralBug;
973 bool has12DWordStoreHazard() const {
974 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
977 // \returns true if the subtarget supports DWORDX3 load/store instructions.
978 bool hasDwordx3LoadStores() const {
982 bool hasReadM0MovRelInterpHazard() const {
983 return getGeneration() == AMDGPUSubtarget::GFX9;
986 bool hasReadM0SendMsgHazard() const {
987 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
988 getGeneration() <= AMDGPUSubtarget::GFX9;
991 bool hasReadM0LdsDmaHazard() const {
992 return getGeneration() == AMDGPUSubtarget::GFX9;
995 bool hasReadM0LdsDirectHazard() const {
996 return getGeneration() == AMDGPUSubtarget::GFX9;
999 bool hasVcmpxPermlaneHazard() const {
1000 return HasVcmpxPermlaneHazard;
1003 bool hasVMEMtoScalarWriteHazard() const {
1004 return HasVMEMtoScalarWriteHazard;
1007 bool hasSMEMtoVectorWriteHazard() const {
1008 return HasSMEMtoVectorWriteHazard;
1011 bool hasLDSMisalignedBug() const {
1012 return LDSMisalignedBug && !EnableCuMode;
1015 bool hasInstFwdPrefetchBug() const {
1016 return HasInstFwdPrefetchBug;
1019 bool hasVcmpxExecWARHazard() const {
1020 return HasVcmpxExecWARHazard;
1023 bool hasLdsBranchVmemWARHazard() const {
1024 return HasLdsBranchVmemWARHazard;
1027 // Shift amount of a 64 bit shift cannot be the highest allocated register
1028 // if also at the end of the allocation block.
1029 bool hasShift64HighRegBug() const {
1030 return GFX90AInsts && !GFX940Insts;
1033 // Has one cycle hazard on transcendental instruction feeding a
1034 // non transcendental VALU.
1035 bool hasTransForwardingHazard() const { return GFX940Insts; }
1037 // Has one cycle hazard on a VALU instruction partially writing dst with
1038 // a shift of result bits feeding another VALU instruction.
1039 bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1041 // Cannot use op_sel with v_dot instructions.
1042 bool hasDOTOpSelHazard() const { return GFX940Insts; }
1044 // Does not have HW interlocks for VALU writing and then reading SGPRs.
1045 bool hasVDecCoExecHazard() const {
1049 bool hasNSAtoVMEMBug() const {
1050 return HasNSAtoVMEMBug;
1053 bool hasNSAClauseBug() const { return HasNSAClauseBug; }
/// \returns true when the subtarget generation is GFX10 or newer.
1055 bool hasHardClauses() const { return getGeneration() >= GFX10; }
1057 bool hasGFX90AInsts() const { return GFX90AInsts; }
1059 bool hasFPAtomicToDenormModeHazard() const {
1060 return getGeneration() == GFX10;
/// \returns true when the subtarget generation is GFX11 or newer.
1063 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
/// \returns true when the subtarget generation is GFX11 or newer.
1065 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1067 bool hasVALUPartialForwardingHazard() const {
1068 return getGeneration() >= GFX11;
1071 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
/// \returns true when the subtarget generation is GFX11 or newer.
1073 bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; }
1075 /// Return if operations acting on VGPR tuples require even alignment.
1076 bool needsAlignedVGPRs() const { return GFX90AInsts; }
1078 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1079 bool hasSPackHL() const { return GFX11Insts; }
1081 /// Return true if the target's EXP instruction has the COMPR flag, which
1082 /// affects the meaning of the EN (enable) bits.
1083 bool hasCompressedExport() const { return !GFX11Insts; }
1085 /// Return true if the target's EXP instruction supports the NULL export
1087 bool hasNullExportTarget() const { return !GFX11Insts; }
1089 bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }
1091 bool hasVOPDInsts() const { return HasVOPDInsts; }
/// \returns true only when the generation is exactly GFX11. Note this is an
/// equality test, not a lower bound like most generation checks in this class.
1093 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1095 /// Return true if the target has the S_DELAY_ALU instruction.
1096 bool hasDelayAlu() const { return GFX11Insts; }
1098 bool hasPackedTID() const { return HasPackedTID; }
1100 // GFX940 is a derivative of GFX90A. hasGFX940Insts() being true implies that
1101 // hasGFX90AInsts is also true.
1102 bool hasGFX940Insts() const { return GFX940Insts; }
1104 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1106 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1108 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1110 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1112 /// Return occupancy for the given function. Uses the LDS size and the
1113 /// numbers of SGPRs and VGPRs if provided.
1114 /// Note, occupancy can be affected by the scratch allocation as well, but
1115 /// we do not have enough information to compute it.
1116 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
1117 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1119 /// \returns true if the flat_scratch register should be initialized with the
1120 /// pointer to the wave's scratch memory rather than a size and offset.
1121 bool flatScratchIsPointer() const {
1122 return getGeneration() >= AMDGPUSubtarget::GFX9;
1125 /// \returns true if the flat_scratch register is initialized by the HW.
1126 /// In this case it is readonly.
1127 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1129 /// \returns true if the machine has merged shaders in which s0-s7 are
1130 /// reserved by the hardware and user SGPRs start at s8
1131 bool hasMergedShaders() const {
1132 return getGeneration() >= GFX9;
1135 // \returns true if the target supports the pre-NGG legacy geometry path.
1136 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1138 /// \returns SGPR allocation granularity supported by the subtarget.
1139 unsigned getSGPRAllocGranule() const {
1140 return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1143 /// \returns SGPR encoding granularity supported by the subtarget.
1144 unsigned getSGPREncodingGranule() const {
1145 return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1148 /// \returns Total number of SGPRs supported by the subtarget.
1149 unsigned getTotalNumSGPRs() const {
1150 return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1153 /// \returns Addressable number of SGPRs supported by the subtarget.
1154 unsigned getAddressableNumSGPRs() const {
1155 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1158 /// \returns Minimum number of SGPRs that meets the given number of waves per
1159 /// execution unit requirement supported by the subtarget.
1160 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1161 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1164 /// \returns Maximum number of SGPRs that meets the given number of waves per
1165 /// execution unit requirement supported by the subtarget.
1166 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1167 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1170 /// \returns Reserved number of SGPRs. This is common
1171 /// utility function called by MachineFunction and
1172 /// Function variants of getReservedNumSGPRs.
1173 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1174 /// \returns Reserved number of SGPRs for given machine function \p MF.
1175 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1177 /// \returns Reserved number of SGPRs for given function \p F.
1178 unsigned getReservedNumSGPRs(const Function &F) const;
1180 /// \returns max num SGPRs. This is the common utility
1181 /// function called by MachineFunction and Function
1182 /// variants of getMaxNumSGPRs.
1183 unsigned getBaseMaxNumSGPRs(const Function &F,
1184 std::pair<unsigned, unsigned> WavesPerEU,
1185 unsigned PreloadedSGPRs,
1186 unsigned ReservedNumSGPRs) const;
1188 /// \returns Maximum number of SGPRs that meets number of waves per execution
1189 /// unit requirement for function \p MF, or number of SGPRs explicitly
1190 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1192 /// \returns Value that meets number of waves per execution unit requirement
1193 /// if explicitly requested value cannot be converted to integer, violates
1194 /// subtarget's specifications, or does not meet number of waves per execution
1195 /// unit requirement.
1196 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1198 /// \returns Maximum number of SGPRs that meets number of waves per execution
1199 /// unit requirement for function \p F, or number of SGPRs explicitly
1200 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1202 /// \returns Value that meets number of waves per execution unit requirement
1203 /// if explicitly requested value cannot be converted to integer, violates
1204 /// subtarget's specifications, or does not meet number of waves per execution
1205 /// unit requirement.
1206 unsigned getMaxNumSGPRs(const Function &F) const;
1208 /// \returns VGPR allocation granularity supported by the subtarget.
1209 unsigned getVGPRAllocGranule() const {
1210 return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1213 /// \returns VGPR encoding granularity supported by the subtarget.
1214 unsigned getVGPREncodingGranule() const {
1215 return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1218 /// \returns Total number of VGPRs supported by the subtarget.
1219 unsigned getTotalNumVGPRs() const {
1220 return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1223 /// \returns Addressable number of VGPRs supported by the subtarget.
1224 unsigned getAddressableNumVGPRs() const {
1225 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1228 /// \returns the minimum number of VGPRs that will prevent achieving more than
1229 /// the specified number of waves \p WavesPerEU.
1230 unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1231 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1234 /// \returns the maximum number of VGPRs that can be used and still achieve
1235 /// at least the specified number of waves \p WavesPerEU.
1236 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1237 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1240 /// \returns max num VGPRs. This is the common utility function
1241 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1242 unsigned getBaseMaxNumVGPRs(const Function &F,
1243 std::pair<unsigned, unsigned> WavesPerEU) const;
1244 /// \returns Maximum number of VGPRs that meets number of waves per execution
1245 /// unit requirement for function \p F, or number of VGPRs explicitly
1246 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1248 /// \returns Value that meets number of waves per execution unit requirement
1249 /// if explicitly requested value cannot be converted to integer, violates
1250 /// subtarget's specifications, or does not meet number of waves per execution
1251 /// unit requirement.
1252 unsigned getMaxNumVGPRs(const Function &F) const;
1254 unsigned getMaxNumAGPRs(const Function &F) const {
1255 return getMaxNumVGPRs(F);
1258 /// \returns Maximum number of VGPRs that meets number of waves per execution
1259 /// unit requirement for function \p MF, or number of VGPRs explicitly
1260 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1262 /// \returns Value that meets number of waves per execution unit requirement
1263 /// if explicitly requested value cannot be converted to integer, violates
1264 /// subtarget's specifications, or does not meet number of waves per execution
1265 /// unit requirement.
1266 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1268 void getPostRAMutations(
1269 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1272 std::unique_ptr<ScheduleDAGMutation>
1273 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1275 bool isWave32() const {
1276 return getWavefrontSize() == 32;
1279 bool isWave64() const {
1280 return getWavefrontSize() == 64;
1283 const TargetRegisterClass *getBoolRC() const {
1284 return getRegisterInfo()->getBoolRC();
1287 /// \returns Maximum number of work groups per compute unit supported by the
1288 /// subtarget and limited by given \p FlatWorkGroupSize.
1289 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1290 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1293 /// \returns Minimum flat work group size supported by the subtarget.
1294 unsigned getMinFlatWorkGroupSize() const override {
1295 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1298 /// \returns Maximum flat work group size supported by the subtarget.
1299 unsigned getMaxFlatWorkGroupSize() const override {
1300 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1303 /// \returns Number of waves per execution unit required to support the given
1304 /// \p FlatWorkGroupSize.
1306 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1307 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1310 /// \returns Minimum number of waves per execution unit supported by the
1312 unsigned getMinWavesPerEU() const override {
1313 return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1316 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1317 SDep &Dep) const override;
1319 // \returns true if it's beneficial on this subtarget for the scheduler to
1320 // cluster stores as well as loads.
1321 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1323 // \returns the number of address arguments from which to enable MIMG NSA
1324 // on supported architectures.
1325 unsigned getNSAThreshold(const MachineFunction &MF) const;
1328 } // end namespace llvm
1330 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H