//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>
#include <climits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
31 AMDGPUSubtarget::~AMDGPUSubtarget() = default;
34 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
35 StringRef GPU, StringRef FS) {
36 // Determine default and user-specified characteristics
37 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
38 // enabled, but some instructions do not respect them and they run at the
39 // double precision rate, so don't enable by default.
41 // We want to be able to turn these off, but making this a subtarget feature
42 // for SI has the unhelpful behavior that it unsets everything else if you
45 SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
46 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
47 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
51 ParseSubtargetFeatures(GPU, FullFS);
53 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
54 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
55 // variants of MUBUF instructions.
56 if (!hasAddr64() && !FS.contains("flat-for-global")) {
60 // FIXME: I don't think think Evergreen has any useful support for
61 // denormals, but should be checked. Should we issue a warning somewhere
62 // if someone tries to enable these?
63 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
64 FP64FP16Denormals = false;
65 FP32Denormals = false;
68 // Set defaults if needed.
69 if (MaxPrivateElementSize == 0)
70 MaxPrivateElementSize = 4;
75 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
76 const TargetMachine &TM)
77 : AMDGPUGenSubtargetInfo(TT, GPU, FS),
79 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
80 IsaVersion(ISAVersion0_0_0),
84 MaxPrivateElementSize(0),
90 FP64FP16Denormals(false),
94 UnalignedScratchAccess(false),
95 UnalignedBufferAccess(false),
97 HasApertureRegs(false),
100 DebuggerInsertNops(false),
101 DebuggerReserveRegs(false),
102 DebuggerEmitPrologue(false),
104 EnableVGPRSpilling(false),
105 EnablePromoteAlloca(false),
106 EnableLoadStoreOpt(false),
107 EnableUnsafeDSOffsetFolding(false),
108 EnableSIScheduler(false),
118 HasSMemRealTime(false),
119 Has16BitInsts(false),
120 HasVOP3PInsts(false),
122 HasVGPRIndexMode(false),
123 HasScalarStores(false),
124 HasInv2PiInlineImm(false),
127 FlatAddressSpace(false),
132 HasVertexCache(false),
134 ScalarizeGlobal(false),
136 FeatureDisable(false),
137 InstrItins(getInstrItineraryForCPU(GPU)) {
138 AS = AMDGPU::getAMDGPUAS(TT);
139 initializeSubtargetDependencies(TT, GPU, FS);
142 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
143 const Function &F) const {
145 return getLocalMemorySize();
146 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
147 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
148 unsigned MaxWaves = getMaxWavesPerEU();
149 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
152 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
153 const Function &F) const {
154 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
155 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
156 unsigned MaxWaves = getMaxWavesPerEU();
157 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
158 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
159 NumWaves = std::min(NumWaves, MaxWaves);
160 NumWaves = std::max(NumWaves, 1u);
164 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
165 const Function &F) const {
166 // Default minimum/maximum flat work group sizes.
167 std::pair<unsigned, unsigned> Default =
168 AMDGPU::isCompute(F.getCallingConv()) ?
169 std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
170 getWavefrontSize() * 4) :
171 std::pair<unsigned, unsigned>(1, getWavefrontSize());
173 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
174 // starts using "amdgpu-flat-work-group-size" attribute.
175 Default.second = AMDGPU::getIntegerAttribute(
176 F, "amdgpu-max-work-group-size", Default.second);
177 Default.first = std::min(Default.first, Default.second);
179 // Requested minimum/maximum flat work group sizes.
180 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
181 F, "amdgpu-flat-work-group-size", Default);
183 // Make sure requested minimum is less than requested maximum.
184 if (Requested.first > Requested.second)
187 // Make sure requested values do not violate subtarget's specifications.
188 if (Requested.first < getMinFlatWorkGroupSize())
190 if (Requested.second > getMaxFlatWorkGroupSize())
196 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
197 const Function &F) const {
198 // Default minimum/maximum number of waves per execution unit.
199 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
201 // Default/requested minimum/maximum flat work group sizes.
202 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
204 // If minimum/maximum flat work group sizes were explicitly requested using
205 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
206 // number of waves per execution unit to values implied by requested
207 // minimum/maximum flat work group sizes.
208 unsigned MinImpliedByFlatWorkGroupSize =
209 getMaxWavesPerEU(FlatWorkGroupSizes.second);
210 bool RequestedFlatWorkGroupSize = false;
212 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
213 // starts using "amdgpu-flat-work-group-size" attribute.
214 if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
215 F.hasFnAttribute("amdgpu-flat-work-group-size")) {
216 Default.first = MinImpliedByFlatWorkGroupSize;
217 RequestedFlatWorkGroupSize = true;
220 // Requested minimum/maximum number of waves per execution unit.
221 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
222 F, "amdgpu-waves-per-eu", Default, true);
224 // Make sure requested minimum is less than requested maximum.
225 if (Requested.second && Requested.first > Requested.second)
228 // Make sure requested values do not violate subtarget's specifications.
229 if (Requested.first < getMinWavesPerEU() ||
230 Requested.first > getMaxWavesPerEU())
232 if (Requested.second > getMaxWavesPerEU())
235 // Make sure requested values are compatible with values implied by requested
236 // minimum/maximum flat work group sizes.
237 if (RequestedFlatWorkGroupSize &&
238 Requested.first > MinImpliedByFlatWorkGroupSize)
244 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
245 Function *Kernel = I->getParent()->getParent();
246 unsigned MinSize = 0;
247 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
248 bool IdQuery = false;
250 // If reqd_work_group_size is present it narrows value down.
251 if (auto *CI = dyn_cast<CallInst>(I)) {
252 const Function *F = CI->getCalledFunction();
254 unsigned Dim = UINT_MAX;
255 switch (F->getIntrinsicID()) {
256 case Intrinsic::amdgcn_workitem_id_x:
257 case Intrinsic::r600_read_tidig_x:
259 case Intrinsic::r600_read_local_size_x:
262 case Intrinsic::amdgcn_workitem_id_y:
263 case Intrinsic::r600_read_tidig_y:
265 case Intrinsic::r600_read_local_size_y:
268 case Intrinsic::amdgcn_workitem_id_z:
269 case Intrinsic::r600_read_tidig_z:
271 case Intrinsic::r600_read_local_size_z:
278 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
279 if (Node->getNumOperands() == 3)
280 MinSize = MaxSize = mdconst::extract<ConstantInt>(
281 Node->getOperand(Dim))->getZExtValue();
289 // Range metadata is [Lo, Hi). For ID query we need to pass max size
290 // as Hi. For size query we need to pass Hi + 1.
296 MDBuilder MDB(I->getContext());
297 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
299 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
303 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
304 const TargetMachine &TM) :
305 AMDGPUSubtarget(TT, GPU, FS, TM),
307 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
310 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
311 const TargetMachine &TM) :
312 AMDGPUSubtarget(TT, GPU, FS, TM),
314 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
317 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
318 unsigned NumRegionInstrs) const {
319 // Track register pressure so the scheduler can try to decrease
320 // pressure once register usage is above the threshold defined by
321 // SIRegisterInfo::getRegPressureSetLimit()
322 Policy.ShouldTrackPressure = true;
324 // Enabling both top down and bottom up scheduling seems to give us less
325 // register spills than just using one of these approaches on its own.
326 Policy.OnlyTopDown = false;
327 Policy.OnlyBottomUp = false;
329 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
330 if (!enableSIScheduler())
331 Policy.ShouldTrackLaneMasks = true;
334 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
335 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
338 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
339 unsigned ExplicitArgBytes) const {
340 unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
341 if (ImplicitBytes == 0)
342 return ExplicitArgBytes;
344 unsigned Alignment = getAlignmentForImplicitArgPtr();
345 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
348 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
349 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
371 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
393 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
394 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
395 if (MFI.hasFlatScratchInit()) {
396 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
397 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
398 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
399 return 4; // FLAT_SCRATCH, VCC (in that order).
402 if (isXNACKEnabled())
403 return 4; // XNACK, VCC (in that order).
407 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
408 const Function &F = *MF.getFunction();
409 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
411 // Compute maximum number of SGPRs function can use using default/requested
412 // minimum number of waves per execution unit.
413 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
414 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
415 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
417 // Check if maximum number of SGPRs was explicitly requested using
418 // "amdgpu-num-sgpr" attribute.
419 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
420 unsigned Requested = AMDGPU::getIntegerAttribute(
421 F, "amdgpu-num-sgpr", MaxNumSGPRs);
423 // Make sure requested value does not violate subtarget's specifications.
424 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
427 // If more SGPRs are required to support the input user/system SGPRs,
428 // increase to accommodate them.
430 // FIXME: This really ends up using the requested number of SGPRs + number
431 // of reserved special registers in total. Theoretically you could re-use
432 // the last input registers for these special registers, but this would
433 // require a lot of complexity to deal with the weird aliasing.
434 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
435 if (Requested && Requested < InputNumSGPRs)
436 Requested = InputNumSGPRs;
438 // Make sure requested value is compatible with values implied by
439 // default/requested minimum/maximum number of waves per execution unit.
440 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
442 if (WavesPerEU.second &&
443 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
447 MaxNumSGPRs = Requested;
450 if (hasSGPRInitBug())
451 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
453 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
454 MaxAddressableNumSGPRs);
457 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
458 const Function &F = *MF.getFunction();
459 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
461 // Compute maximum number of VGPRs function can use using default/requested
462 // minimum number of waves per execution unit.
463 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
464 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
466 // Check if maximum number of VGPRs was explicitly requested using
467 // "amdgpu-num-vgpr" attribute.
468 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
469 unsigned Requested = AMDGPU::getIntegerAttribute(
470 F, "amdgpu-num-vgpr", MaxNumVGPRs);
472 // Make sure requested value does not violate subtarget's specifications.
473 if (Requested && Requested <= getReservedNumVGPRs(MF))
476 // Make sure requested value is compatible with values implied by
477 // default/requested minimum/maximum number of waves per execution unit.
478 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
480 if (WavesPerEU.second &&
481 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
485 MaxNumVGPRs = Requested;
488 return MaxNumVGPRs - getReservedNumVGPRs(MF);