1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
13 //===----------------------------------------------------------------------===//
15 #include "AMDGPUSubtarget.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/CodeGen/MachineScheduler.h"
19 #include "llvm/IR/MDBuilder.h"
20 #include "llvm/Target/TargetFrameLowering.h"
25 #define DEBUG_TYPE "amdgpu-subtarget"
27 #define GET_SUBTARGETINFO_TARGET_DESC
28 #define GET_SUBTARGETINFO_CTOR
29 #include "AMDGPUGenSubtargetInfo.inc"
// Defaulted destructor, defined out-of-line in the .cpp so it is emitted in a
// single translation unit.
31 AMDGPUSubtarget::~AMDGPUSubtarget() = default;
// Parse the GPU/feature strings and fix up feature bits that depend on the OS
// and generation.  Called from the constructor before any other queries.
// NOTE(review): the listing elides lines here (the return-type line of this
// definition, the body of the flat-for-global `if`, and the function's
// closing brace are not shown) — consult the full file before editing.
34 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
35 StringRef GPU, StringRef FS) {
36 // Determine default and user-specified characteristics
37 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
38 // enabled, but some instructions do not respect them and they run at the
39 // double precision rate, so don't enable by default.
41 // We want to be able to turn these off, but making this a subtarget feature
42 // for SI has the unhelpful behavior that it unsets everything else if you
// Default-on features are prepended so that explicit bits in FS (parsed
// later) can still override them.
45 SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
46 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
47 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
51 ParseSubtargetFeatures(GPU, FullFS);
53 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
54 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
55 // variants of MUBUF instructions.
56 if (!hasAddr64() && !FS.contains("flat-for-global")) {
60 // FIXME: I don't think Evergreen has any useful support for
61 // denormals, but should be checked. Should we issue a warning somewhere
62 // if someone tries to enable these?
// Pre-SI (R600-family) generations: force both denormal modes off.
63 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
64 FP64FP16Denormals = false;
65 FP32Denormals = false;
68 // Set defaults if needed.
69 if (MaxPrivateElementSize == 0)
70 MaxPrivateElementSize = 4;
// Constructor: every feature flag starts false / zero and is then set by
// ParseSubtargetFeatures via initializeSubtargetDependencies, so the values
// below are only the pre-parse defaults.
// NOTE(review): several initializer-list entries and the closing brace fall
// on lines elided from this listing.
75 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
76 const TargetMachine &TM)
77 : AMDGPUGenSubtargetInfo(TT, GPU, FS),
// amdgcn triples start at SOUTHERN_ISLANDS; everything else is R600-family.
79 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
80 IsaVersion(ISAVersion0_0_0),
84 MaxPrivateElementSize(0),
90 FP64FP16Denormals(false),
94 AutoWaitcntBeforeBarrier(false),
95 UnalignedScratchAccess(false),
96 UnalignedBufferAccess(false),
98 HasApertureRegs(false),
101 DebuggerInsertNops(false),
102 DebuggerReserveRegs(false),
103 DebuggerEmitPrologue(false),
105 EnableVGPRSpilling(false),
106 EnablePromoteAlloca(false),
107 EnableLoadStoreOpt(false),
108 EnableUnsafeDSOffsetFolding(false),
109 EnableSIScheduler(false),
119 HasSMemRealTime(false),
120 Has16BitInsts(false),
121 HasVOP3PInsts(false),
123 HasVGPRIndexMode(false),
124 HasScalarStores(false),
125 HasInv2PiInlineImm(false),
128 FlatAddressSpace(false),
129 FlatInstOffsets(false),
130 FlatGlobalInsts(false),
131 FlatScratchInsts(false),
136 HasVertexCache(false),
138 ScalarizeGlobal(false),
140 FeatureDisable(false),
141 InstrItins(getInstrItineraryForCPU(GPU)) {
// Cache the address-space mapping for this triple, then resolve the real
// feature bits from GPU/FS.
142 AS = AMDGPU::getAMDGPUAS(TT);
143 initializeSubtargetDependencies(TT, GPU, FS);
// Returns the maximum number of LDS bytes a workgroup can use while still
// sustaining NWaves waves per execution unit, by scaling the per-CU LDS size
// by the wave/workgroup occupancy ratio.
// NOTE(review): as listed, the bare `return getLocalMemorySize();` makes the
// rest of the function unreachable — its guarding `if` (presumably the
// NWaves-at-minimum case) is on an elided line; confirm against the full file.
146 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
147 const Function &F) const {
149 return getLocalMemorySize();
150 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
151 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
152 unsigned MaxWaves = getMaxWavesPerEU();
153 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
// Inverse of getMaxLocalMemSizeWithWaveCount: given an LDS byte budget,
// compute how many waves per EU can run, clamped to [1, getMaxWavesPerEU()].
// NOTE(review): the final `return NumWaves;` and closing brace fall on lines
// elided from this listing.
156 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
157 const Function &F) const {
158 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
159 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
160 unsigned MaxWaves = getMaxWavesPerEU();
161 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
// Guard against division by zero when the function uses no LDS.
162 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
163 NumWaves = std::min(NumWaves, MaxWaves);
164 NumWaves = std::max(NumWaves, 1u);
// Returns the {minimum, maximum} flat workgroup size for F: defaults derived
// from the wavefront size, overridden by the function's
// "amdgpu-max-work-group-size" / "amdgpu-flat-work-group-size" attributes and
// then validated against the subtarget's limits.
// NOTE(review): the bodies of the three validation `if`s (which return the
// defaults on invalid requests) and the final return are on elided lines.
168 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
169 const Function &F) const {
170 // Default minimum/maximum flat work group sizes.
171 std::pair<unsigned, unsigned> Default =
172 AMDGPU::isCompute(F.getCallingConv()) ?
173 std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
174 getWavefrontSize() * 4) :
175 std::pair<unsigned, unsigned>(1, getWavefrontSize());
177 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
178 // starts using "amdgpu-flat-work-group-size" attribute.
179 Default.second = AMDGPU::getIntegerAttribute(
180 F, "amdgpu-max-work-group-size", Default.second);
181 Default.first = std::min(Default.first, Default.second);
183 // Requested minimum/maximum flat work group sizes.
184 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
185 F, "amdgpu-flat-work-group-size", Default);
187 // Make sure requested minimum is less than requested maximum.
188 if (Requested.first > Requested.second)
191 // Make sure requested values do not violate subtarget's specifications.
192 if (Requested.first < getMinFlatWorkGroupSize())
194 if (Requested.second > getMaxFlatWorkGroupSize())
// Returns the {minimum, maximum} number of waves per execution unit for F,
// taking "amdgpu-waves-per-eu" into account and reconciling it with the
// bounds implied by the requested flat workgroup sizes.
// NOTE(review): the bodies of the validation `if`s (returning Default on
// invalid requests) and the final return are on elided lines.
200 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
201 const Function &F) const {
202 // Default minimum/maximum number of waves per execution unit.
203 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
205 // Default/requested minimum/maximum flat work group sizes.
206 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
208 // If minimum/maximum flat work group sizes were explicitly requested using
209 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
210 // number of waves per execution unit to values implied by requested
211 // minimum/maximum flat work group sizes.
212 unsigned MinImpliedByFlatWorkGroupSize =
213 getMaxWavesPerEU(FlatWorkGroupSizes.second);
214 bool RequestedFlatWorkGroupSize = false;
216 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
217 // starts using "amdgpu-flat-work-group-size" attribute.
218 if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
219 F.hasFnAttribute("amdgpu-flat-work-group-size")) {
220 Default.first = MinImpliedByFlatWorkGroupSize;
221 RequestedFlatWorkGroupSize = true;
224 // Requested minimum/maximum number of waves per execution unit.
225 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
226 F, "amdgpu-waves-per-eu", Default, true);
228 // Make sure requested minimum is less than requested maximum.
// Requested.second == 0 means "no explicit maximum", so skip the ordering
// check in that case.
229 if (Requested.second && Requested.first > Requested.second)
232 // Make sure requested values do not violate subtarget's specifications.
233 if (Requested.first < getMinWavesPerEU() ||
234 Requested.first > getMaxWavesPerEU())
236 if (Requested.second > getMaxWavesPerEU())
239 // Make sure requested values are compatible with values implied by requested
240 // minimum/maximum flat work group sizes.
241 if (RequestedFlatWorkGroupSize &&
242 Requested.first > MinImpliedByFlatWorkGroupSize)
// Attaches !range metadata to a local-ID / local-size intrinsic call I,
// bounding it by the kernel's flat workgroup size (narrowed further by
// reqd_work_group_size when present).  Returns whether metadata was added.
// NOTE(review): this listing elides the per-case bodies of the switch (the
// Dim assignments / IdQuery flags), the early-return guards, and the upper
// bound passed to createRange — consult the full file before editing.
248 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
249 Function *Kernel = I->getParent()->getParent();
250 unsigned MinSize = 0;
251 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
252 bool IdQuery = false;
254 // If reqd_work_group_size is present it narrows value down.
255 if (auto *CI = dyn_cast<CallInst>(I)) {
256 const Function *F = CI->getCalledFunction();
258 unsigned Dim = UINT_MAX;
259 switch (F->getIntrinsicID()) {
260 case Intrinsic::amdgcn_workitem_id_x:
261 case Intrinsic::r600_read_tidig_x:
263 case Intrinsic::r600_read_local_size_x:
266 case Intrinsic::amdgcn_workitem_id_y:
267 case Intrinsic::r600_read_tidig_y:
269 case Intrinsic::r600_read_local_size_y:
272 case Intrinsic::amdgcn_workitem_id_z:
273 case Intrinsic::r600_read_tidig_z:
275 case Intrinsic::r600_read_local_size_z:
// reqd_work_group_size pins the size exactly in the queried dimension.
282 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
283 if (Node->getNumOperands() == 3)
284 MinSize = MaxSize = mdconst::extract<ConstantInt>(
285 Node->getOperand(Dim))->getZExtValue();
293 // Range metadata is [Lo, Hi). For ID query we need to pass max size
294 // as Hi. For size query we need to pass Hi + 1.
300 MDBuilder MDB(I->getContext());
301 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
303 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
// R600 subtarget: delegates to the common AMDGPUSubtarget constructor; the
// frame grows upward (scratch is addressed positively from the frame base).
// NOTE(review): the remaining initializer-list entries and closing brace are
// on lines elided from this listing.
307 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
308 const TargetMachine &TM) :
309 AMDGPUSubtarget(TT, GPU, FS, TM),
311 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
// SI (GCN) subtarget: delegates to the common AMDGPUSubtarget constructor;
// like R600, the frame grows upward.
// NOTE(review): the remaining initializer-list entries and closing brace are
// on lines elided from this listing.
314 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
315 const TargetMachine &TM) :
316 AMDGPUSubtarget(TT, GPU, FS, TM),
318 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
321 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
322 unsigned NumRegionInstrs) const {
323 // Track register pressure so the scheduler can try to decrease
324 // pressure once register usage is above the threshold defined by
325 // SIRegisterInfo::getRegPressureSetLimit()
326 Policy.ShouldTrackPressure = true;
328 // Enabling both top down and bottom up scheduling seems to give us less
329 // register spills than just using one of these approaches on its own.
330 Policy.OnlyTopDown = false;
331 Policy.OnlyBottomUp = false;
333 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
334 if (!enableSIScheduler())
335 Policy.ShouldTrackLaneMasks = true;
338 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
339 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
342 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
343 unsigned ExplicitArgBytes) const {
344 unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
345 if (ImplicitBytes == 0)
346 return ExplicitArgBytes;
348 unsigned Alignment = getAlignmentForImplicitArgPtr();
349 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
// Returns the waves-per-EU occupancy achievable with the given SGPR count.
// NOTE(review): the entire body (the VI+ branch shown below, the pre-VI
// thresholds, and the returns) falls on lines elided from this listing.
352 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
353 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Returns the waves-per-EU occupancy achievable with the given VGPR count.
// NOTE(review): the body is entirely on lines elided from this listing.
375 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
// Returns how many SGPRs at the top of the allocation are reserved for
// FLAT_SCRATCH, XNACK and VCC, depending on generation and features.
// NOTE(review): the closing braces and the fallback return (the plain-VCC
// case) are on lines elided from this listing.
397 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
398 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
399 if (MFI.hasFlatScratchInit()) {
400 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
401 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
402 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
403 return 4; // FLAT_SCRATCH, VCC (in that order).
406 if (isXNACKEnabled())
407 return 4; // XNACK, VCC (in that order).
// Computes the maximum number of SGPRs available to MF, combining the
// waves-per-EU-implied limit, the optional "amdgpu-num-sgpr" attribute, the
// preloaded input SGPRs, the SGPR-init hardware bug, and the reserved SGPRs.
// NOTE(review): the bodies of several clamp `if`s (which adjust Requested)
// are on lines elided from this listing.
411 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
412 const Function &F = *MF.getFunction();
413 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
415 // Compute maximum number of SGPRs function can use using default/requested
416 // minimum number of waves per execution unit.
417 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
418 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
419 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
421 // Check if maximum number of SGPRs was explicitly requested using
422 // "amdgpu-num-sgpr" attribute.
423 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
424 unsigned Requested = AMDGPU::getIntegerAttribute(
425 F, "amdgpu-num-sgpr", MaxNumSGPRs);
427 // Make sure requested value does not violate subtarget's specifications.
428 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
431 // If more SGPRs are required to support the input user/system SGPRs,
432 // increase to accommodate them.
434 // FIXME: This really ends up using the requested number of SGPRs + number
435 // of reserved special registers in total. Theoretically you could re-use
436 // the last input registers for these special registers, but this would
437 // require a lot of complexity to deal with the weird aliasing.
438 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
439 if (Requested && Requested < InputNumSGPRs)
440 Requested = InputNumSGPRs;
442 // Make sure requested value is compatible with values implied by
443 // default/requested minimum/maximum number of waves per execution unit.
444 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
446 if (WavesPerEU.second &&
447 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
// A validated non-zero request overrides the computed maximum.
451 MaxNumSGPRs = Requested;
// Hardware with the SGPR-init bug must use a fixed SGPR count.
454 if (hasSGPRInitBug())
455 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
457 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
458 MaxAddressableNumSGPRs);
// Computes the maximum number of VGPRs available to MF, mirroring
// getMaxNumSGPRs: waves-per-EU-implied limit, optional "amdgpu-num-vgpr"
// attribute, then subtract the reserved VGPRs.
// NOTE(review): the bodies of the clamp `if`s and the function's closing
// brace are on lines elided from this listing.
461 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
462 const Function &F = *MF.getFunction();
463 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
465 // Compute maximum number of VGPRs function can use using default/requested
466 // minimum number of waves per execution unit.
467 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
468 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
470 // Check if maximum number of VGPRs was explicitly requested using
471 // "amdgpu-num-vgpr" attribute.
472 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
473 unsigned Requested = AMDGPU::getIntegerAttribute(
474 F, "amdgpu-num-vgpr", MaxNumVGPRs);
476 // Make sure requested value does not violate subtarget's specifications.
477 if (Requested && Requested <= getReservedNumVGPRs(MF))
480 // Make sure requested value is compatible with values implied by
481 // default/requested minimum/maximum number of waves per execution unit.
482 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
484 if (WavesPerEU.second &&
485 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
// A validated non-zero request overrides the computed maximum.
489 MaxNumVGPRs = Requested;
492 return MaxNumVGPRs - getReservedNumVGPRs(MF);