//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"
GCNSubtarget::~GCNSubtarget() = default;
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,"
              "+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);
  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}
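// Note on feature-string ordering (illustrative, not from the original file):
// the user features in FS are appended after the defaults built into FullFS
// above, and later entries win, so a hypothetical invocation such as
//   llc -mtriple=amdgcn -mattr=-load-store-opt ...
// turns off only +load-store-opt while leaving the other defaults intact.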
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDotInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
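// Worked example (illustrative numbers only): with 65536 bytes of LDS, a
// maximum of 10 waves per EU, and a work group size allowing 4 work groups
// per CU, asking for 5 waves yields 65536 * 10 / 4 / 5 = 32768 bytes of LDS
// available per work group.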
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
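// Worked example (illustrative numbers only): with 65536 bytes of LDS, 10
// waves per EU, and 2 work groups per CU, the budget is 65536 * 10 / 2 =
// 327680 bytes; a kernel using 65536 bytes of LDS then gets 327680 / 65536 =
// 5 waves, clamped to the [1, 10] range.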
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
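// For a subtarget with a 64-lane wavefront this yields [128, 256] for compute
// kernels, [1, 64] for the graphics shader stages, and [1, 1024] for
// everything else.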
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: The default maximum should be 1024 for non-kernel functions.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
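// Frontends request an explicit range with a function attribute, e.g.:
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// A range that violates the subtarget's limits falls back to the default.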
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
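// Example: "amdgpu-waves-per-eu"="2,4" requests between 2 and 4 waves per
// execution unit; a request that conflicts with the flat work group size
// attribute (or the subtarget's limits) is ignored in favor of the default.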
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();

    unsigned Dim = UINT_MAX;
    switch (F ? F->getIntrinsicID() : Intrinsic::not_intrinsic) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::r600_read_tidig_x:
      IdQuery = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::r600_read_local_size_x:
      Dim = 0;
      break;
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::r600_read_tidig_y:
      IdQuery = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::r600_read_local_size_y:
      Dim = 1;
      break;
    case Intrinsic::amdgcn_workitem_id_z:
    case Intrinsic::r600_read_tidig_z:
      IdQuery = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::r600_read_local_size_z:
      Dim = 2;
      break;
    default:
      break;
    }
    if (Dim <= 3) {
      if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
        if (Node->getNumOperands() == 3)
          MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                Node->getOperand(Dim))->getZExtValue();
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
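// Example result (illustrative): in a kernel annotated with
// reqd_work_group_size metadata of {256, 1, 1}, an x-dimension workitem ID
// call is tagged with !range !{i32 0, i32 256}, i.e. IDs in [0, 256), while
// an x-dimension local-size query would be tagged with the range [256, 257).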
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}
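// Worked example (assuming the usual ABI alignments): for kernel(float,
// <2 x float>) the float occupies bytes [0, 4), the 8-byte-aligned vector is
// placed at [8, 16), so the explicit size is 16 and MaxAlign is 8.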
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClockHalved(false),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
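// Worked example (illustrative; the constants are subtarget-dependent): with
// 256 total VGPRs, an allocation granule of 4, and a kernel using 96 VGPRs,
// the rounded allocation is 96, giving min(max(256 / 96, 1), MaxWaves) = 2
// waves per EU.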
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
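// Example: "amdgpu-num-sgpr"="48" caps the kernel at 48 SGPRs, provided 48
// exceeds the reserved registers and respects the waves-per-EU constraints
// computed above.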
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
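// Example: "amdgpu-num-vgpr"="64" caps the kernel at 64 VGPRs, again subject
// to the waves-per-EU constraints computed above.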
namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
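// For instance, two back-to-back DS reads from adjacent LDS addresses stay
// adjacent after scheduling, so later passes (e.g. load/store optimization)
// still have the chance to combine them into a single ds_read2 instruction.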
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}