//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
15 #include "AMDGPUSubtarget.h"
17 #include "AMDGPUTargetMachine.h"
18 #ifdef LLVM_BUILD_GLOBAL_ISEL
19 #include "AMDGPUCallLowering.h"
20 #include "AMDGPUInstructionSelector.h"
21 #include "AMDGPULegalizerInfo.h"
22 #include "AMDGPURegisterBankInfo.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/CodeGen/MachineScheduler.h"
27 #include "llvm/IR/MDBuilder.h"
28 #include "llvm/Target/TargetFrameLowering.h"
33 #define DEBUG_TYPE "amdgpu-subtarget"
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #define GET_SUBTARGETINFO_CTOR
37 #include "AMDGPUGenSubtargetInfo.inc"
39 AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {

struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<RegisterBankInfo> RegBankInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
  const InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }
  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }
  const RegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }
};

} // end anonymous namespace
#endif

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    MaxPrivateElementSize(0),
    HalfRate64Ops(false),
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),
    HasApertureRegs(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWAScalar(false),
    HasSDWAOutModsVOPC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    HasVertexCache(false),
    ScalarizeGlobal(false),
    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
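
// Worked example for the bound computed above (numbers are illustrative, not
// taken from any particular target): with 64 KiB of LDS per CU, MaxWaves = 10,
// 4 work groups resident per CU, and NWaves = 8, each work group may use up to
// 65536 * 10 / 4 / 8 = 20480 bytes of LDS.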

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
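
// Illustrative occupancy example for the computation above: with 64 KiB of
// LDS, MaxWaves = 10, and 4 work groups per CU, Limit = 163840; a kernel that
// uses 32 KiB of LDS then gets min(163840 / 32768, 10) = 5 waves, clamped to
// the range [1, MaxWaves].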

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
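
// The requested sizes above come from an IR function attribute of the form
// "min,max", e.g. (values illustrative):
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// A request with min > max, or one outside the subtarget's supported range,
// falls back to the defaults computed above.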

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
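
// The waves-per-EU request likewise comes from an IR function attribute, e.g.
// (values illustrative):
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
// The maximum may be omitted (the call above only requires the first value);
// requests that conflict with the limits implied by the flat work group size
// fall back to the defaults.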

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
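
// Example of the metadata attached above (values illustrative): for an ID
// query such as a call to llvm.amdgcn.workitem.id.x with MaxSize = 256, the
// call gets !range !0 with !0 = !{i32 0, i32 256}, i.e. the half-open
// interval [0, 256); a size query instead gets [MinSize, MaxSize + 1).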

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
  : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TLInfo(TM, *this) {
#ifndef LLVM_BUILD_GLOBAL_ISEL
  GISelAccessor *GISel = new GISelAccessor();
#else
  SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
  GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  GISel->Legalizer.reset(new AMDGPULegalizerInfo());

  GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  GISel->InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
#endif
  setGISelAccessor(*GISel);
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using either approach on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}
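
// Illustrative example of the computation above: with 36 bytes of explicit
// kernel arguments, an 8-byte implicit-argument alignment, and 56 implicit
// bytes, the segment size is alignTo(36, 8) + 56 = 40 + 56 = 96 bytes.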

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}