//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

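// The generated *.inc file supplies the TableGen'erated feature tables and
// the ParseSubtargetFeatures() implementation used below.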
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global was specified explicitly, turn on FlatForGlobal
  // for all OSes on VI and newer hardware to avoid assertion failures due to
  // missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
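  // Scale the available LDS by the fraction of the CU's waves that the
  // requested occupancy represents. Illustrative numbers: 65536 bytes of LDS,
  // 10 waves max, 4 work groups per CU, and NWaves = 5 give
  // 65536 * 10 / 4 / 5 = 32768 bytes per work group.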
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
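  // Clamp to the subtarget's wave limit; at least one wave per EU is always
  // possible.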
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
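  // Graphics shaders have no dispatch-size guarantee, so only assume a single
  // wavefront by default.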
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

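  // For example, a workitem-ID query in a kernel with a known group size of 64
  // gets !range !{i32 0, i32 64}, while the matching size query gets
  // !range !{i32 64, i32 65} (illustrative values).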
  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo());

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
  unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
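  // Pad the explicit arguments up to the implicit block's alignment, then
  // append the implicit block. Illustrative numbers: 36 explicit bytes and a
  // 56-byte implicit block at 8-byte alignment give alignTo(36, 8) + 56 = 96.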
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

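// Occupancy (waves per EU) as limited by SGPR usage. The thresholds below
// follow the hardware's SGPR allocation granularity; e.g. on VI a wave using
// 90 SGPRs limits its EU to 8 waves.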
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

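// Occupancy as limited by VGPR usage, out of the 256-register file per SIMD;
// e.g. a wave using 40 VGPRs allows 256 / 40 ≈ 6 waves per EU.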
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).

  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

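  // Some VI parts have a hardware bug requiring a fixed SGPR allocation size;
  // override whatever was computed above with that fixed value.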
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

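        // Mirror SU's predecessors onto SUa and SUa's successors onto SU with
        // artificial edges, so no other instruction can be scheduled between
        // the two memory operations.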
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // end anonymous namespace

void SISubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}