1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
13 //===----------------------------------------------------------------------===//
15 #include "AMDGPUSubtarget.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/CodeGen/MachineScheduler.h"
18 #include "llvm/Target/TargetFrameLowering.h"
23 #define DEBUG_TYPE "amdgpu-subtarget"
25 #define GET_SUBTARGETINFO_ENUM
26 #define GET_SUBTARGETINFO_TARGET_DESC
27 #define GET_SUBTARGETINFO_CTOR
28 #include "AMDGPUGenSubtargetInfo.inc"
/// Out-of-line definition of the defaulted AMDGPUSubtarget destructor.
30 AMDGPUSubtarget::~AMDGPUSubtarget() = default;
/// Assembles the feature string for \p GPU, parses it, and then fixes up
/// settings that depend on the parsed result.
///
/// \param TT  Target triple (not referenced by the statements shown here).
/// \param GPU Processor name handed to ParseSubtargetFeatures().
/// \param FS  User-specified feature string; also searched for an explicit
///            "flat-for-global" setting.
33 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
34 StringRef GPU, StringRef FS) {
35 // Determine default and user-specified characteristics
36 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
37 // enabled, but some instructions do not respect them and they run at the
38 // double precision rate, so don't enable by default.
40 // We want to be able to turn these off, but making this a subtarget feature
41 // for SI has the unhelpful behavior that it unsets everything else if you
// Default features; each entry is enabled with a leading '+'.
44 SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
45 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
46 FullFS += "+flat-for-global,+unaligned-buffer-access,";
49 ParseSubtargetFeatures(GPU, FullFS);
51 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
52 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
53 // variants of MUBUF instructions.
// The user string FS (not FullFS) is searched, so the defaults added above
// do not count as an explicit request.
54 if (!hasAddr64() && !FS.contains("flat-for-global")) {
58 // FIXME: I don't think Evergreen has any useful support for
59 // denormals, but should be checked. Should we issue a warning somewhere
60 // if someone tries to enable these?
// All three denormal modes are forced off for generations up to and
// including NORTHERN_ISLANDS, overriding whatever was parsed.
61 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
62 FP16Denormals = false;
63 FP32Denormals = false;
64 FP64Denormals = false;
67 // Set defaults if needed.
// 0 is treated as "not set" and replaced by 4.
68 if (MaxPrivateElementSize == 0)
69 MaxPrivateElementSize = 4;
/// Constructs the common AMDGPU subtarget state.
///
/// The feature flags listed in the initializer list below start out disabled
/// (false), and the constructor body then calls
/// initializeSubtargetDependencies() with the same triple, GPU name and
/// feature string.
///
/// \param TT  Target triple; its architecture selects the initial generation.
/// \param GPU Processor name; also used to look up the instruction itinerary.
/// \param FS  Feature string forwarded to the generated base class and to
///            initializeSubtargetDependencies().
/// \param TM  Target machine (not referenced by the initializers shown here).
74 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
75 const TargetMachine &TM)
76 : AMDGPUGenSubtargetInfo(TT, GPU, FS),
// Initial generation: SOUTHERN_ISLANDS for the amdgcn architecture, R600 for
// anything else.
78 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
79 IsaVersion(ISAVersion0_0_0),
// 0 means "not set"; initializeSubtargetDependencies() replaces it with a
// default if it is still 0 after feature parsing.
83 MaxPrivateElementSize(0),
93 UnalignedScratchAccess(false),
94 UnalignedBufferAccess(false),
97 DebuggerInsertNops(false),
98 DebuggerReserveRegs(false),
99 DebuggerEmitPrologue(false),
101 EnableVGPRSpilling(false),
102 EnablePromoteAlloca(false),
103 EnableLoadStoreOpt(false),
104 EnableUnsafeDSOffsetFolding(false),
105 EnableSIScheduler(false),
114 HasSMemRealTime(false),
115 Has16BitInsts(false),
117 HasVGPRIndexMode(false),
118 HasScalarStores(false),
119 HasInv2PiInlineImm(false),
120 FlatAddressSpace(false),
125 HasVertexCache(false),
127 ScalarizeGlobal(false),
129 FeatureDisable(false),
130 InstrItins(getInstrItineraryForCPU(GPU)) {
131 initializeSubtargetDependencies(TT, GPU, FS);
134 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
136 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
157 return getLocalMemorySize();
161 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
/// Computes the (minimum, maximum) flat work group size pair for \p F.
///
/// Defaults depend on the calling convention: compute functions start at
/// (2 * wavefront size, 4 * wavefront size), everything else at
/// (1, wavefront size). The "amdgpu-max-work-group-size" attribute may replace
/// the default maximum, and the "amdgpu-flat-work-group-size" attribute may
/// supply an explicit pair, which is then checked for ordering and against
/// the subtarget's minimum/maximum flat work group sizes.
///
/// \param F Function whose calling convention and attributes are inspected.
192 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
193 const Function &F) const {
194 // Default minimum/maximum flat work group sizes.
195 std::pair<unsigned, unsigned> Default =
196 AMDGPU::isCompute(F.getCallingConv()) ?
197 std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
198 getWavefrontSize() * 4) :
199 std::pair<unsigned, unsigned>(1, getWavefrontSize());
201 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
202 // starts using "amdgpu-flat-work-group-size" attribute.
203 Default.second = AMDGPU::getIntegerAttribute(
204 F, "amdgpu-max-work-group-size", Default.second);
// Keep the default minimum from exceeding the (possibly overridden) maximum.
205 Default.first = std::min(Default.first, Default.second);
207 // Requested minimum/maximum flat work group sizes.
208 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
209 F, "amdgpu-flat-work-group-size", Default);
211 // Make sure requested minimum does not exceed requested maximum.
212 if (Requested.first > Requested.second)
215 // Make sure requested values do not violate subtarget's specifications.
216 if (Requested.first < getMinFlatWorkGroupSize())
218 if (Requested.second > getMaxFlatWorkGroupSize())
/// Computes the (minimum, maximum) number of waves per execution unit for
/// \p F.
///
/// The default pair is (1, 0). When \p F carries either the
/// "amdgpu-max-work-group-size" or the "amdgpu-flat-work-group-size"
/// attribute, the default minimum is replaced by the value implied by the
/// flat work group sizes. The "amdgpu-waves-per-eu" attribute may supply an
/// explicit pair, which is then checked for ordering, against the subtarget's
/// wave limits, and against the value implied by the flat work group sizes.
///
/// \param F Function whose attributes are inspected.
224 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
225 const Function &F) const {
226 // Default minimum/maximum number of waves per execution unit.
227 std::pair<unsigned, unsigned> Default(1, 0);
229 // Default/requested minimum/maximum flat work group sizes.
230 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
232 // If minimum/maximum flat work group sizes were explicitly requested using
233 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
234 // number of waves per execution unit to values implied by requested
235 // minimum/maximum flat work group sizes.
// The implied minimum is getMaxWavesPerEU() evaluated at the maximum flat
// work group size.
236 unsigned MinImpliedByFlatWorkGroupSize =
237 getMaxWavesPerEU(FlatWorkGroupSizes.second);
238 bool RequestedFlatWorkGroupSize = false;
240 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
241 // starts using "amdgpu-flat-work-group-size" attribute.
242 if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
243 F.hasFnAttribute("amdgpu-flat-work-group-size")) {
244 Default.first = MinImpliedByFlatWorkGroupSize;
245 RequestedFlatWorkGroupSize = true;
248 // Requested minimum/maximum number of waves per execution unit.
// NOTE(review): the trailing `true` presumably makes the second value of the
// attribute optional -- confirm against AMDGPU::getIntegerPairAttribute.
249 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
250 F, "amdgpu-waves-per-eu", Default, true);
252 // Make sure requested minimum does not exceed requested maximum. A maximum
// of 0 skips this ordering check.
253 if (Requested.second && Requested.first > Requested.second)
256 // Make sure requested values do not violate subtarget's specifications.
257 if (Requested.first < getMinWavesPerEU() ||
258 Requested.first > getMaxWavesPerEU())
260 if (Requested.second > getMaxWavesPerEU())
263 // Make sure requested values are compatible with values implied by requested
264 // minimum/maximum flat work group sizes.
265 if (RequestedFlatWorkGroupSize &&
266 Requested.first > MinImpliedByFlatWorkGroupSize)
/// Constructs the R600 subtarget on top of the common AMDGPUSubtarget state.
///
/// All four arguments are forwarded unchanged to the AMDGPUSubtarget
/// constructor. FrameLowering is set up with an upward-growing stack and the
/// alignment reported by getStackAlignment().
/// NOTE(review): the trailing 0 is presumably the local area offset --
/// confirm against TargetFrameLowering's constructor.
272 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
273 const TargetMachine &TM) :
274 AMDGPUSubtarget(TT, GPU, FS, TM),
276 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
/// Constructs the SI subtarget on top of the common AMDGPUSubtarget state.
///
/// All four arguments are forwarded unchanged to the AMDGPUSubtarget
/// constructor. FrameLowering is set up with an upward-growing stack and the
/// alignment reported by getStackAlignment().
/// NOTE(review): the trailing 0 is presumably the local area offset --
/// confirm against TargetFrameLowering's constructor.
279 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
280 const TargetMachine &TM) :
281 AMDGPUSubtarget(TT, GPU, FS, TM),
283 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
/// Adjusts the machine scheduler policy for SI.
///
/// Register pressure tracking is switched on, neither top-down-only nor
/// bottom-up-only scheduling is selected, and lane mask tracking is switched
/// on unless the SI scheduler is enabled.
///
/// \param Policy          Scheduling policy to modify in place.
/// \param NumRegionInstrs Not referenced by the statements shown here.
286 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
287 unsigned NumRegionInstrs) const {
288 // Track register pressure so the scheduler can try to decrease
289 // pressure once register usage is above the threshold defined by
290 // SIRegisterInfo::getRegPressureSetLimit()
291 Policy.ShouldTrackPressure = true;
293 // Enabling both top down and bottom up scheduling seems to give us less
294 // register spills than just using one of these approaches on its own.
295 Policy.OnlyTopDown = false;
296 Policy.OnlyBottomUp = false;
298 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
299 if (!enableSIScheduler())
300 Policy.ShouldTrackLaneMasks = true;
/// Reports whether VGPR spilling is enabled for \p F.
///
/// \returns true when EnableVGPRSpilling is set, or when AMDGPU::isShader()
/// rejects the calling convention of \p F; false otherwise.
303 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
304 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
/// Computes the kernel argument segment size for \p MF.
///
/// \param MF               Machine function used to query the implicit
///                         argument byte count.
/// \param ExplicitArgBytes Size of the explicit arguments.
/// \returns \p ExplicitArgBytes unchanged when there are no implicit argument
/// bytes; otherwise \p ExplicitArgBytes aligned with alignTo() to
/// getAlignmentForImplicitArgPtr(), plus the implicit argument bytes.
307 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
308 unsigned ExplicitArgBytes) const {
309 unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
310 if (ImplicitBytes == 0)
311 return ExplicitArgBytes;
// The implicit arguments follow the explicit ones, so the explicit size is
// first aligned to the implicit argument pointer alignment.
313 unsigned Alignment = getAlignmentForImplicitArgPtr();
314 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
317 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
318 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
340 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
362 unsigned SISubtarget::getMaxNumSGPRs() const {
363 if (hasSGPRInitBug())
364 return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
366 if (getGeneration() >= VOLCANIC_ISLANDS)